    Date: Sunday, October 31, 2021 @ 22:49:00
  Author: kgizdov
Revision: 1035135
upgpkg: python-pytorch 1.10.0-1: upgpkg: python-pytorch 1.10.0-1 Added: python-pytorch/trunk/66219.patch python-pytorch/trunk/fix-jit-frontend-nullptr-deref.patch python-pytorch/trunk/fix_old_nnapi_lite_interpreter_config.patch Modified: python-pytorch/trunk/PKGBUILD python-pytorch/trunk/fix-building-for-torchvision.patch python-pytorch/trunk/fix_c10.patch Deleted: python-pytorch/trunk/benchmark-gcc11.patch python-pytorch/trunk/disable_non_x86_64.patch python-pytorch/trunk/xnnpack-gcc11.patch ---------------------------------------------+ 66219.patch | 3002 ++++++++++++++++++++++++++ PKGBUILD | 135 - benchmark-gcc11.patch | 30 disable_non_x86_64.patch | 15 fix-building-for-torchvision.patch | 6 fix-jit-frontend-nullptr-deref.patch | 12 fix_c10.patch | 2 fix_old_nnapi_lite_interpreter_config.patch | 33 xnnpack-gcc11.patch | 33 9 files changed, 3115 insertions(+), 153 deletions(-) Added: 66219.patch =================================================================== --- 66219.patch (rev 0) +++ 66219.patch 2021-10-31 22:49:00 UTC (rev 1035135) @@ -0,0 +1,3002 @@ +From 55473c81535db8890d35e29cff852b737954ce80 Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Wed, 6 Oct 2021 11:52:58 -0700 +Subject: [PATCH 01/30] Refactor cub namespace + +--- + aten/src/ATen/cuda/cub.cuh | 102 +++++++++--------- + aten/src/ATen/cuda/cub_definitions.cuh | 17 +++ + caffe2/core/context_gpu.cu | 1 + + caffe2/operators/accuracy_op.cu | 1 + + caffe2/operators/affine_channel_op.cu | 1 + + caffe2/operators/arg_ops.cu | 2 +- + caffe2/operators/batch_moments_op.cu | 1 + + caffe2/operators/batch_sparse_to_dense_op.cu | 1 + + caffe2/operators/boolean_mask_ops.cu | 2 +- + caffe2/operators/cross_entropy_op.cu | 1 + + caffe2/operators/distance_op.cu | 1 + + caffe2/operators/elementwise_div_op.cu | 2 +- + caffe2/operators/elementwise_linear_op.cu | 1 + + caffe2/operators/elementwise_mul_op.cu | 2 +- + caffe2/operators/elementwise_ops.cu | 1 + + caffe2/operators/find_op.cu | 1 + + caffe2/operators/generate_proposals_op.cu | 2 +- + caffe2/operators/normalize_ops.cu | 1 + + caffe2/operators/one_hot_ops.cu | 1 + + caffe2/operators/pack_segments.cu | 2 +- + caffe2/operators/prelu_op.cu | 1 + + caffe2/operators/reduce_front_back_max_ops.cu | 1 + + .../reduce_front_back_sum_mean_ops.cu | 1 + + caffe2/operators/reduction_ops.cu | 2 +- + caffe2/operators/rmac_regions_op.cu | 7 ++ + caffe2/operators/segment_reduction_op_gpu.cuh | 1 + + caffe2/operators/sequence_ops.cu | 2 +- + caffe2/operators/softmax_ops.cu | 1 + + .../operators/spatial_batch_norm_op_impl.cuh | 2 +- + caffe2/sgd/adagrad_fused_op_gpu.cu | 1 + + caffe2/sgd/adagrad_op_gpu.cu | 1 + + caffe2/sgd/adam_op_gpu.cu | 1 + + caffe2/utils/math/reduce.cu | 2 +- + caffe2/utils/math/reduce.cuh | 2 +- + caffe2/utils/math_gpu.cu | 2 +- + cmake/Dependencies.cmake | 5 + + 36 files changed, 115 insertions(+), 60 deletions(-) + create mode 100644 aten/src/ATen/cuda/cub_definitions.cuh + +diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh +index 5d8ae777ebef..39938efc48be 100644 +--- a/aten/src/ATen/cuda/cub.cuh ++++ b/aten/src/ATen/cuda/cub.cuh +@@ -5,16 +5,24 @@ + #include <iterator> + #include <limits> + +-// include cub in a safe manner, see: +-// https://github.com/pytorch/pytorch/pull/55292 ++#include <ATen/cuda/cub_definitions.cuh> ++ ++#if CUB_SUPPORTS_WRAPPED_NAMESPACE() ++ ++#include <cub/cub.cuh> ++ ++#else ++ + #undef CUB_NS_POSTFIX //undef to avoid redefinition warnings + #undef CUB_NS_PREFIX +-#define CUB_NS_PREFIX namespace at { namespace 
cuda { namespace detail { +-#define CUB_NS_POSTFIX }}} ++#define CUB_NS_PREFIX namespace at_cuda_detail { ++#define CUB_NS_POSTFIX } + #include <cub/cub.cuh> + #undef CUB_NS_POSTFIX + #undef CUB_NS_PREFIX + ++#endif ++ + #include <ATen/cuda/Exceptions.h> + #include <c10/cuda/CUDACachingAllocator.h> + #include <c10/cuda/CUDAStream.h> +@@ -33,16 +41,41 @@ + #define NO_ROCM(x) + #else + #define NO_ROCM(x) x ++#endif + +-namespace at { namespace native { ++#if !defined(USE_ROCM) && !CUB_SUPPORTS_NV_BFLOAT16() + +-namespace cub = at::cuda::detail::cub; ++namespace at_cuda_detail { ++// backport https://github.com/NVIDIA/cub/pull/306 for c10::BFloat16 + +-}} ++template <> ++struct cub::FpLimits<c10::BFloat16> ++{ ++ static __host__ __device__ __forceinline__ c10::BFloat16 Max() { ++ unsigned short max_word = 0x7F7F; ++ return reinterpret_cast<c10::BFloat16&>(max_word); ++ } ++ ++ static __host__ __device__ __forceinline__ c10::BFloat16 Lowest() { ++ unsigned short lowest_word = 0xFF7F; ++ return reinterpret_cast<c10::BFloat16&>(lowest_word); ++ } ++}; ++ ++template <> struct cub::NumericTraits<c10::BFloat16>: cub::BaseTraits<cub::FLOATING_POINT, true, false, unsigned short, c10::BFloat16> {}; ++} + #endif + ++namespace at { namespace native { ++namespace cub = at_cuda_detail::cub; ++}} ++namespace caffe2 { ++namespace cub = at_cuda_detail::cub; ++} ++ + namespace at { + namespace cuda { ++namespace cub { + + namespace detail { + +@@ -55,44 +88,17 @@ struct cuda_type<c10::Half> { + using type = __half; + }; + +-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11050 +-// cub sort support for __nv_bfloat16 is added to cub 1.13 in +-// https://github.com/NVIDIA/cub/pull/306 and according to +-// https://github.com/NVIDIA/cub#releases, 1.13 is included in +-// CUDA Toolkit 11.5 ++#if CUB_SUPPORTS_NV_BFLOAT16() + +-// waiting for https://github.com/NVIDIA/cub/pull/306 to land on CUDA + template<> + struct cuda_type<c10::BFloat16> { + using type = __nv_bfloat16; + }; + +-#elif !defined(__HIP_PLATFORM_HCC__) +- +-// backport https://github.com/NVIDIA/cub/pull/306 for c10::BFloat16 +- +-template <> +-struct cub::FpLimits<c10::BFloat16> +-{ +- static __host__ __device__ __forceinline__ c10::BFloat16 Max() { +- unsigned short max_word = 0x7F7F; +- return reinterpret_cast<c10::BFloat16&>(max_word); +- } +- +- static __host__ __device__ __forceinline__ c10::BFloat16 Lowest() { +- unsigned short lowest_word = 0xFF7F; +- return reinterpret_cast<c10::BFloat16&>(lowest_word); +- } +-}; +- +-template <> struct cub::NumericTraits<c10::BFloat16>: cub::BaseTraits<cub::FLOATING_POINT, true, false, unsigned short, c10::BFloat16> {}; +- + #endif + + } // namespace detail + +-namespace cub { +- + inline int get_num_bits(uint64_t max_key) { + int num_bits = 1; + while (max_key > 1) { +@@ -115,11 +121,11 @@ static inline void sort_keys( + key_t_ *keys_out_ = reinterpret_cast<key_t_*>(keys_out); + + if (descending) { +- CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceRadixSort::SortKeysDescending, ++ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceRadixSort::SortKeysDescending, + keys_in_, keys_out_, n, + begin_bit, end_bit, c10::cuda::getCurrentCUDAStream()); + } else { +- CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceRadixSort::SortKeys, ++ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceRadixSort::SortKeys, + keys_in_, keys_out_, n, + begin_bit, end_bit, c10::cuda::getCurrentCUDAStream()); + } +@@ -147,11 +153,11 @@ static inline void sort_pairs( + key_t_ *keys_out_ = reinterpret_cast<key_t_*>(keys_out); + + if (descending) { +- 
CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceRadixSort::SortPairsDescending, ++ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceRadixSort::SortPairsDescending, + keys_in_, keys_out_, values_in, values_out, n, + begin_bit, end_bit, c10::cuda::getCurrentCUDAStream()); + } else { +- CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceRadixSort::SortPairs, ++ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceRadixSort::SortPairs, + keys_in_, keys_out_, values_in, values_out, n, + begin_bit, end_bit, c10::cuda::getCurrentCUDAStream()); + } +@@ -183,12 +189,12 @@ static inline void segmented_sort_pairs( + key_t_ *keys_out_ = reinterpret_cast<key_t_*>(keys_out); + + if (descending) { +- CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceSegmentedRadixSort::SortPairsDescending, ++ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSegmentedRadixSort::SortPairsDescending, + keys_in_, keys_out_, values_in, values_out, + num_elements, num_segments, begin_offsets, end_offsets, + begin_bit, end_bit, c10::cuda::getCurrentCUDAStream()); + } else { +- CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceSegmentedRadixSort::SortPairs, ++ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSegmentedRadixSort::SortPairs, + keys_in_, keys_out_, values_in, values_out, + num_elements, num_segments, begin_offsets, end_offsets, + begin_bit, end_bit, c10::cuda::getCurrentCUDAStream()); +@@ -240,7 +246,7 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT + // so split at int_max/2 + constexpr int max_cub_size = std::numeric_limits<int>::max() / 2 + 1; // 2**30 + int size_cub = std::min<int64_t>(num_items, max_cub_size); +- CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceScan::InclusiveScan, ++ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::InclusiveScan, + input, + output, + scan_op, +@@ -260,7 +266,7 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT + first_elem_ptr, + scan_op); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +- using ArgIndexInputIterator = NO_ROCM(detail)::cub::ArgIndexInputIterator<InputIteratorT>; ++ using ArgIndexInputIterator = NO_ROCM(at_cuda_detail)::cub::ArgIndexInputIterator<InputIteratorT>; + using tuple = typename ArgIndexInputIterator::value_type; + auto input_iter_transform = [=] __device__ (const tuple &x)->input_t { + if (x.key == 0) { +@@ -269,9 +275,9 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT + return x.value; + } + }; +- auto input_ = NO_ROCM(detail)::cub::TransformInputIterator<input_t, decltype(input_iter_transform), ArgIndexInputIterator>( ++ auto input_ = NO_ROCM(at_cuda_detail)::cub::TransformInputIterator<input_t, decltype(input_iter_transform), ArgIndexInputIterator>( + ArgIndexInputIterator(input + i), input_iter_transform); +- CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceScan::InclusiveScan, ++ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::InclusiveScan, + input_, + output + i, + scan_op, +@@ -287,7 +293,7 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT + // so split at int_max/2 + constexpr int max_cub_size = std::numeric_limits<int>::max() / 2 + 1; // 2**30 + int size_cub = std::min<int64_t>(num_items, max_cub_size); +- CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceScan::ExclusiveScan, ++ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::ExclusiveScan, + input, + output, + scan_op, +@@ -309,7 +315,7 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT + C10_CUDA_KERNEL_LAUNCH_CHECK(); + auto input_ = impl::chained_iterator<InitValueT, InputIteratorT>{ + 
input + i, first_elem_ptr}; +- CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceScan::InclusiveScan, ++ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::InclusiveScan, + input_, + output + i, + scan_op, +@@ -322,7 +328,7 @@ template<typename InputIteratorT , typename OutputIteratorT , typename NumSelect + inline void unique(InputIteratorT input, OutputIteratorT output, NumSelectedIteratorT num_selected_out, int64_t num_items) { + TORCH_CHECK(num_items <= std::numeric_limits<int>::max(), + "cub unique does not support more than INT_MAX elements"); +- CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceSelect::Unique, ++ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSelect::Unique, + input, output, num_selected_out, num_items, at::cuda::getCurrentCUDAStream()); + } + +diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh +new file mode 100644 +index 000000000000..a7694fda4168 +--- /dev/null ++++ b/aten/src/ATen/cuda/cub_definitions.cuh +@@ -0,0 +1,17 @@ ++#include <cub/version.cuh> ++ ++// cub sort support for __nv_bfloat16 is added to cub 1.13 in: ++// https://github.com/NVIDIA/cub/pull/306 ++#if CUB_VERSION >= 101300 ++#define CUB_SUPPORTS_NV_BFLOAT16() true ++#elif ++#define CUB_SUPPORTS_NV_BFLOAT16() false ++#endif ++ ++// cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.14 in: ++// https://github.com/NVIDIA/cub/pull/326 ++#if CUB_VERSION >= 101400 ++#define CUB_SUPPORTS_WRAPPED_NAMESPACE() true ++#elif ++#define CUB_SUPPORTS_WRAPPED_NAMESPACE() false ++#endif +\ No newline at end of file +diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu +index c2b89945ada9..475ed61ab4f7 100644 +--- a/caffe2/core/context_gpu.cu ++++ b/caffe2/core/context_gpu.cu +@@ -4,6 +4,7 @@ + #include <string> + #include <unordered_map> + ++#include <ATen/cuda/cub.cuh> + #include <ATen/Context.h> + #include <c10/cuda/CUDAFunctions.h> + #include <c10/cuda/CUDACachingAllocator.h> +diff --git a/caffe2/operators/accuracy_op.cu b/caffe2/operators/accuracy_op.cu +index f06663d71a90..7ad2b09c238a 100644 +--- a/caffe2/operators/accuracy_op.cu ++++ b/caffe2/operators/accuracy_op.cu +@@ -3,6 +3,7 @@ + #include "caffe2/utils/GpuAtomics.cuh" + #include "caffe2/utils/math.h" + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + namespace caffe2 { +diff --git a/caffe2/operators/affine_channel_op.cu b/caffe2/operators/affine_channel_op.cu +index adf4ac55c0fc..f3d9e22c7e8c 100644 +--- a/caffe2/operators/affine_channel_op.cu ++++ b/caffe2/operators/affine_channel_op.cu +@@ -1,5 +1,6 @@ + #include "caffe2/operators/affine_channel_op.h" + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/arg_ops.cu b/caffe2/operators/arg_ops.cu +index 7e90d25b836b..fbefe0774376 100644 +--- a/caffe2/operators/arg_ops.cu ++++ b/caffe2/operators/arg_ops.cu +@@ -2,8 +2,8 @@ + + #include <limits> + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> +-#include <cub/cub.cuh> + + #include "caffe2/core/common_gpu.h" + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/batch_moments_op.cu b/caffe2/operators/batch_moments_op.cu +index 4b693b5c04e2..65c43200e5bd 100644 +--- a/caffe2/operators/batch_moments_op.cu ++++ b/caffe2/operators/batch_moments_op.cu +@@ -1,5 +1,6 @@ + #include "caffe2/operators/batch_moments_op.h" + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git 
a/caffe2/operators/batch_sparse_to_dense_op.cu b/caffe2/operators/batch_sparse_to_dense_op.cu +index aea2035a5d3d..2cb09deb8668 100644 +--- a/caffe2/operators/batch_sparse_to_dense_op.cu ++++ b/caffe2/operators/batch_sparse_to_dense_op.cu +@@ -1,5 +1,6 @@ + #include "caffe2/operators/batch_sparse_to_dense_op.h" + ++#include <ATen/cuda/cub.cuh> + #include <cub/device/device_scan.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/boolean_mask_ops.cu b/caffe2/operators/boolean_mask_ops.cu +index 214b7c13ba3c..c87688f51d64 100644 +--- a/caffe2/operators/boolean_mask_ops.cu ++++ b/caffe2/operators/boolean_mask_ops.cu +@@ -3,7 +3,7 @@ + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/boolean_mask_ops.h" + +-#include <cub/cub.cuh> ++#include <ATen/cuda/cub.cuh> + + namespace caffe2 { + +diff --git a/caffe2/operators/cross_entropy_op.cu b/caffe2/operators/cross_entropy_op.cu +index 380e80399fc3..95f3ffddbf1f 100644 +--- a/caffe2/operators/cross_entropy_op.cu ++++ b/caffe2/operators/cross_entropy_op.cu +@@ -1,4 +1,5 @@ + #include <assert.h> ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/distance_op.cu b/caffe2/operators/distance_op.cu +index 3a8bb337d541..d94691d5a9d9 100644 +--- a/caffe2/operators/distance_op.cu ++++ b/caffe2/operators/distance_op.cu +@@ -4,6 +4,7 @@ + #include "caffe2/operators/distance_op.h" + #include "caffe2/utils/conversions.h" + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + namespace caffe2 { +diff --git a/caffe2/operators/elementwise_div_op.cu b/caffe2/operators/elementwise_div_op.cu +index 42b103a0f110..ca9682326324 100644 +--- a/caffe2/operators/elementwise_div_op.cu ++++ b/caffe2/operators/elementwise_div_op.cu +@@ -3,8 +3,8 @@ + #include <algorithm> + #include <functional> + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> +-#include <cub/cub.cuh> + + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/elementwise_ops_utils.h" +diff --git a/caffe2/operators/elementwise_linear_op.cu b/caffe2/operators/elementwise_linear_op.cu +index cc49115bffc5..c1c45263f34c 100644 +--- a/caffe2/operators/elementwise_linear_op.cu ++++ b/caffe2/operators/elementwise_linear_op.cu +@@ -5,6 +5,7 @@ + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/operator_fallback_gpu.h" + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + namespace caffe2 { +diff --git a/caffe2/operators/elementwise_mul_op.cu b/caffe2/operators/elementwise_mul_op.cu +index bdbf760cf95b..88c3da00edc3 100644 +--- a/caffe2/operators/elementwise_mul_op.cu ++++ b/caffe2/operators/elementwise_mul_op.cu +@@ -3,8 +3,8 @@ + #include <algorithm> + #include <functional> + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> +-#include <cub/cub.cuh> + + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/elementwise_ops_utils.h" +diff --git a/caffe2/operators/elementwise_ops.cu b/caffe2/operators/elementwise_ops.cu +index c9ced33cf806..1ac0426d2ca7 100644 +--- a/caffe2/operators/elementwise_ops.cu ++++ b/caffe2/operators/elementwise_ops.cu +@@ -1,5 +1,6 @@ + #include "caffe2/operators/elementwise_ops.h" + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_load.cuh> + #include <cub/block/block_reduce.cuh> + #include <cub/device/device_reduce.cuh> +diff --git a/caffe2/operators/find_op.cu b/caffe2/operators/find_op.cu +index f8ff2bab1637..666df335ce42 
100644 +--- a/caffe2/operators/find_op.cu ++++ b/caffe2/operators/find_op.cu +@@ -1,3 +1,4 @@ ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/find_op.h" +diff --git a/caffe2/operators/generate_proposals_op.cu b/caffe2/operators/generate_proposals_op.cu +index cab0ad3d0b88..84906a8e8182 100644 +--- a/caffe2/operators/generate_proposals_op.cu ++++ b/caffe2/operators/generate_proposals_op.cu +@@ -1,4 +1,4 @@ +-#include <cub/cub.cuh> ++#include <ATen/cuda/cub.cuh> + #include "caffe2/core/context.h" + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/generate_proposals_op.h" +diff --git a/caffe2/operators/normalize_ops.cu b/caffe2/operators/normalize_ops.cu +index 26df05308d88..468175df985f 100644 +--- a/caffe2/operators/normalize_ops.cu ++++ b/caffe2/operators/normalize_ops.cu +@@ -1,5 +1,6 @@ + #include <algorithm> + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/one_hot_ops.cu b/caffe2/operators/one_hot_ops.cu +index e521b3dd09df..86f82f78bb82 100644 +--- a/caffe2/operators/one_hot_ops.cu ++++ b/caffe2/operators/one_hot_ops.cu +@@ -1,3 +1,4 @@ ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/pack_segments.cu b/caffe2/operators/pack_segments.cu +index 7475100fd368..b9ed413d1e7b 100644 +--- a/caffe2/operators/pack_segments.cu ++++ b/caffe2/operators/pack_segments.cu +@@ -1,4 +1,4 @@ +-#include <cub/cub.cuh> ++#include <ATen/cuda/cub.cuh> + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/pack_segments.h" + +diff --git a/caffe2/operators/prelu_op.cu b/caffe2/operators/prelu_op.cu +index 745a393f075f..d29882086754 100644 +--- a/caffe2/operators/prelu_op.cu ++++ b/caffe2/operators/prelu_op.cu +@@ -1,6 +1,7 @@ + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/prelu_op.h" + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + namespace caffe2 { +diff --git a/caffe2/operators/reduce_front_back_max_ops.cu b/caffe2/operators/reduce_front_back_max_ops.cu +index ae91f8a6da72..ba62b2eff671 100644 +--- a/caffe2/operators/reduce_front_back_max_ops.cu ++++ b/caffe2/operators/reduce_front_back_max_ops.cu +@@ -1,3 +1,4 @@ ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/reduce_front_back_max_ops.h" +diff --git a/caffe2/operators/reduce_front_back_sum_mean_ops.cu b/caffe2/operators/reduce_front_back_sum_mean_ops.cu +index 476596f08425..586c20fe8d8e 100644 +--- a/caffe2/operators/reduce_front_back_sum_mean_ops.cu ++++ b/caffe2/operators/reduce_front_back_sum_mean_ops.cu +@@ -1,3 +1,4 @@ ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/reduce_front_back_sum_mean_ops.h" +diff --git a/caffe2/operators/reduction_ops.cu b/caffe2/operators/reduction_ops.cu +index ba55a66de588..0d94fab22a7f 100644 +--- a/caffe2/operators/reduction_ops.cu ++++ b/caffe2/operators/reduction_ops.cu +@@ -2,7 +2,7 @@ + #include "caffe2/operators/reduction_ops.h" + #include "caffe2/utils/conversions.h" + +-#include <cub/cub.cuh> ++#include <ATen/cuda/cub.cuh> + + namespace caffe2 { + +diff --git a/caffe2/operators/rmac_regions_op.cu b/caffe2/operators/rmac_regions_op.cu +index 76c4d012d71a..39cc5fbc988d 100644 +--- 
a/caffe2/operators/rmac_regions_op.cu ++++ b/caffe2/operators/rmac_regions_op.cu +@@ -1,3 +1,4 @@ ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +@@ -10,6 +11,9 @@ + #if defined(USE_ROCM) + namespace rocprim { + #else ++#if CUB_SUPPORTS_WRAPPED_NAMESPACE() ++namespace at_cuda_detail { ++#endif + namespace cub { + #endif + +@@ -22,6 +26,9 @@ inline __host__ __device__ bool operator<( + } + + } // namespace cub ++#if CUB_SUPPORTS_WRAPPED_NAMESPACE() ++} // namespace at_cuda_detail ++#endif + + namespace caffe2 { + +diff --git a/caffe2/operators/segment_reduction_op_gpu.cuh b/caffe2/operators/segment_reduction_op_gpu.cuh +index 8d51196ee138..447617c6e9de 100644 +--- a/caffe2/operators/segment_reduction_op_gpu.cuh ++++ b/caffe2/operators/segment_reduction_op_gpu.cuh +@@ -1,3 +1,4 @@ ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + #include <cub/device/device_reduce.cuh> + #include <cub/device/device_scan.cuh> +diff --git a/caffe2/operators/sequence_ops.cu b/caffe2/operators/sequence_ops.cu +index cc34effd3f22..e66d491f85e6 100644 +--- a/caffe2/operators/sequence_ops.cu ++++ b/caffe2/operators/sequence_ops.cu +@@ -1,6 +1,6 @@ + #include <algorithm> + +-#include <cub/cub.cuh> ++#include <ATen/cuda/cub.cuh> + + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/sequence_ops.h" +diff --git a/caffe2/operators/softmax_ops.cu b/caffe2/operators/softmax_ops.cu +index 51c0cbc2bf6a..c01fcf3e0a48 100644 +--- a/caffe2/operators/softmax_ops.cu ++++ b/caffe2/operators/softmax_ops.cu +@@ -1,4 +1,5 @@ + #include <cfloat> ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/spatial_batch_norm_op_impl.cuh b/caffe2/operators/spatial_batch_norm_op_impl.cuh +index edc076c7d718..f9b9fb58adc8 100644 +--- a/caffe2/operators/spatial_batch_norm_op_impl.cuh ++++ b/caffe2/operators/spatial_batch_norm_op_impl.cuh +@@ -5,8 +5,8 @@ + + #include <limits> + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> +-#include <cub/cub.cuh> + + #include "caffe2/core/context_gpu.h" + #include "caffe2/utils/math.h" +diff --git a/caffe2/sgd/adagrad_fused_op_gpu.cu b/caffe2/sgd/adagrad_fused_op_gpu.cu +index 2c2ad2cf76ae..396da5195125 100644 +--- a/caffe2/sgd/adagrad_fused_op_gpu.cu ++++ b/caffe2/sgd/adagrad_fused_op_gpu.cu +@@ -2,6 +2,7 @@ + #include <c10/core/GeneratorImpl.h> + #include <algorithm> + ++#include <ATen/cuda/cub.cuh> + #include <cub/device/device_radix_sort.cuh> + #include "caffe2/sgd/adagrad_fused_op_gpu.cuh" + #include "caffe2/utils/math.h" +diff --git a/caffe2/sgd/adagrad_op_gpu.cu b/caffe2/sgd/adagrad_op_gpu.cu +index 8abb3376ca87..a6fa842ddc80 100644 +--- a/caffe2/sgd/adagrad_op_gpu.cu ++++ b/caffe2/sgd/adagrad_op_gpu.cu +@@ -1,5 +1,6 @@ + #include <algorithm> + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/common_gpu.h" + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/sgd/adam_op_gpu.cu b/caffe2/sgd/adam_op_gpu.cu +index 42ab975faacb..4b59836b6a68 100644 +--- a/caffe2/sgd/adam_op_gpu.cu ++++ b/caffe2/sgd/adam_op_gpu.cu +@@ -1,3 +1,4 @@ ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/common_gpu.h" + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/utils/math/reduce.cu b/caffe2/utils/math/reduce.cu +index fc3e476b288b..20919334da50 100644 +--- a/caffe2/utils/math/reduce.cu ++++ 
b/caffe2/utils/math/reduce.cu +@@ -6,8 +6,8 @@ + #include <numeric> + #include <vector> + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> +-#include <cub/cub.cuh> + + #include <thrust/execution_policy.h> + #include <thrust/reduce.h> +diff --git a/caffe2/utils/math/reduce.cuh b/caffe2/utils/math/reduce.cuh +index 0c43ad45a379..39ad553eec76 100644 +--- a/caffe2/utils/math/reduce.cuh ++++ b/caffe2/utils/math/reduce.cuh +@@ -1,8 +1,8 @@ + #ifndef CAFFE2_UTILS_MATH_REDUCE_CUH_ + #define CAFFE2_UTILS_MATH_REDUCE_CUH_ + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> +-#include <cub/cub.cuh> + + #include "caffe2/core/common_gpu.h" + +diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu +index a37d4b744d73..b0a44fed34fb 100644 +--- a/caffe2/utils/math_gpu.cu ++++ b/caffe2/utils/math_gpu.cu +@@ -7,8 +7,8 @@ + #include <numeric> + #include <vector> + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> +-#include <cub/cub.cuh> + + #include <thrust/host_vector.h> + #include <thrust/device_vector.h> +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index ca560288a41a..f127d8f2e5b1 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1622,6 +1622,11 @@ if(NOT INTERN_BUILD_MOBILE) + list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "-fPIC") + endif() + ++ # include cub in a safe manner, see: ++ # https://github.com/pytorch/pytorch/pull/55292 ++ # https://github.com/NVIDIA/cub/releases/tag/1.14.0 ++ list(APPEND CUDA_NVCC_FLAGS "-DCUB_WRAPPED_NAMESPACE=at_cuda_detail") ++ + if(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) + message(STATUS "Found CUDA with FP16 support, compiling with torch.cuda.HalfTensor") + list(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1" "-D__CUDA_NO_HALF_OPERATORS__" "-D__CUDA_NO_HALF_CONVERSIONS__" + +From f0a6afd7f0a5c6210289869282c77a680d7dfa2e Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Wed, 6 Oct 2021 12:16:48 -0700 +Subject: [PATCH 02/30] fix + +--- + .github/workflows/lint.yml | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml +index 4c469878e63f..af472e00140a 100644 +--- a/.github/workflows/lint.yml ++++ b/.github/workflows/lint.yml +@@ -97,7 +97,7 @@ jobs: + - name: Ensure no direct cub include + if: always() + run: | +- (! git --no-pager grep -I -no $'#include <cub/' -- ./aten ':(exclude)aten/src/ATen/cuda/cub.cuh' || (echo "The above files have direct cub include; please include ATen/cuda/cub.cuh instead and wrap your cub calls in at::native namespace if necessary"; false)) ++ (! 
git --no-pager grep -I -no $'#include <cub/' -- ./aten ':(exclude)aten/src/ATen/cuda/cub.cuh' ':(exclude)aten/src/ATen/cuda/cub_definitions.cuh' || (echo "The above files have direct cub include; please include ATen/cuda/cub.cuh instead and wrap your cub calls in at::native namespace if necessary"; false)) + - name: Ensure no raw cuda api calls + if: always() + run: | + +From 8edc6e961f1911f0fe04cb57af13b251f66c9153 Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Wed, 6 Oct 2021 12:26:19 -0700 +Subject: [PATCH 03/30] save + +--- + .github/workflows/lint.yml | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml +index af472e00140a..0b2abe7a7483 100644 +--- a/.github/workflows/lint.yml ++++ b/.github/workflows/lint.yml +@@ -97,7 +97,7 @@ jobs: + - name: Ensure no direct cub include + if: always() + run: | +- (! git --no-pager grep -I -no $'#include <cub/' -- ./aten ':(exclude)aten/src/ATen/cuda/cub.cuh' ':(exclude)aten/src/ATen/cuda/cub_definitions.cuh' || (echo "The above files have direct cub include; please include ATen/cuda/cub.cuh instead and wrap your cub calls in at::native namespace if necessary"; false)) ++ (! git --no-pager grep -I -no $'#include <cub/' -- ./aten ':(exclude)aten/src/ATen/cuda/cub*.cuh' || (echo "The above files have direct cub include; please include ATen/cuda/cub.cuh instead and wrap your cub calls in at::native namespace if necessary"; false)) + - name: Ensure no raw cuda api calls + if: always() + run: | + +From 0d50954a3747ed0f2c7cbfcd58f17a2a81d5929c Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Wed, 6 Oct 2021 12:32:16 -0700 +Subject: [PATCH 04/30] fix + +--- + aten/src/ATen/cuda/cub_definitions.cuh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh +index a7694fda4168..802f67f22a0f 100644 +--- a/aten/src/ATen/cuda/cub_definitions.cuh ++++ b/aten/src/ATen/cuda/cub_definitions.cuh +@@ -14,4 +14,4 @@ + #define CUB_SUPPORTS_WRAPPED_NAMESPACE() true + #elif + #define CUB_SUPPORTS_WRAPPED_NAMESPACE() false +-#endif +\ No newline at end of file ++#endif + +From 2ac5cf508d694995eee4710d9a6499bb3135d324 Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Wed, 6 Oct 2021 14:04:40 -0700 +Subject: [PATCH 05/30] fix + +--- + aten/src/ATen/cuda/cub_definitions.cuh | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh +index 802f67f22a0f..1f5628fa9d3a 100644 +--- a/aten/src/ATen/cuda/cub_definitions.cuh ++++ b/aten/src/ATen/cuda/cub_definitions.cuh +@@ -4,7 +4,7 @@ + // https://github.com/NVIDIA/cub/pull/306 + #if CUB_VERSION >= 101300 + #define CUB_SUPPORTS_NV_BFLOAT16() true +-#elif ++#else + #define CUB_SUPPORTS_NV_BFLOAT16() false + #endif + +@@ -12,6 +12,6 @@ + // https://github.com/NVIDIA/cub/pull/326 + #if CUB_VERSION >= 101400 + #define CUB_SUPPORTS_WRAPPED_NAMESPACE() true +-#elif ++#else + #define CUB_SUPPORTS_WRAPPED_NAMESPACE() false + #endif + +From 815f5a5981919f61aaab2d65597826ddf0495ac9 Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Wed, 6 Oct 2021 14:06:32 -0700 +Subject: [PATCH 06/30] fix rocm + +--- + aten/src/ATen/cuda/cub_definitions.cuh | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh +index 
1f5628fa9d3a..7da3ab716e41 100644 +--- a/aten/src/ATen/cuda/cub_definitions.cuh ++++ b/aten/src/ATen/cuda/cub_definitions.cuh +@@ -1,4 +1,10 @@ ++#pragma once ++ ++#if !defined(USE_ROCM) + #include <cub/version.cuh> ++#else ++#define CUB_VERSION 0 ++#endif + + // cub sort support for __nv_bfloat16 is added to cub 1.13 in: + // https://github.com/NVIDIA/cub/pull/306 + +From 02e9ca26bdc658edd1d960dc5d31e277b9b9afc1 Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Mon, 11 Oct 2021 10:06:27 -0700 +Subject: [PATCH 07/30] revert caffe2 change + +--- + caffe2/core/context_gpu.cu | 1 - + caffe2/operators/accuracy_op.cu | 1 - + caffe2/operators/affine_channel_op.cu | 1 - + caffe2/operators/arg_ops.cu | 2 +- + caffe2/operators/batch_moments_op.cu | 1 - + caffe2/operators/batch_sparse_to_dense_op.cu | 1 - + caffe2/operators/boolean_mask_ops.cu | 2 +- + caffe2/operators/cross_entropy_op.cu | 1 - + caffe2/operators/distance_op.cu | 1 - + caffe2/operators/elementwise_div_op.cu | 2 +- + caffe2/operators/elementwise_linear_op.cu | 1 - + caffe2/operators/elementwise_mul_op.cu | 2 +- + caffe2/operators/elementwise_ops.cu | 1 - + caffe2/operators/find_op.cu | 1 - + caffe2/operators/generate_proposals_op.cu | 2 +- + caffe2/operators/normalize_ops.cu | 1 - + caffe2/operators/one_hot_ops.cu | 1 - + caffe2/operators/pack_segments.cu | 2 +- + caffe2/operators/prelu_op.cu | 1 - + caffe2/operators/reduce_front_back_max_ops.cu | 1 - + caffe2/operators/reduce_front_back_sum_mean_ops.cu | 1 - + caffe2/operators/reduction_ops.cu | 2 +- + caffe2/operators/rmac_regions_op.cu | 7 ------- + caffe2/operators/segment_reduction_op_gpu.cuh | 1 - + caffe2/operators/sequence_ops.cu | 2 +- + caffe2/operators/softmax_ops.cu | 1 - + caffe2/operators/spatial_batch_norm_op_impl.cuh | 2 +- + caffe2/sgd/adagrad_fused_op_gpu.cu | 1 - + caffe2/sgd/adagrad_op_gpu.cu | 1 - + caffe2/sgd/adam_op_gpu.cu | 1 - + caffe2/utils/math/reduce.cu | 2 +- + caffe2/utils/math/reduce.cuh | 2 +- + caffe2/utils/math_gpu.cu | 2 +- + 33 files changed, 12 insertions(+), 39 deletions(-) + +diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu +index 475ed61ab4f7..c2b89945ada9 100644 +--- a/caffe2/core/context_gpu.cu ++++ b/caffe2/core/context_gpu.cu +@@ -4,7 +4,6 @@ + #include <string> + #include <unordered_map> + +-#include <ATen/cuda/cub.cuh> + #include <ATen/Context.h> + #include <c10/cuda/CUDAFunctions.h> + #include <c10/cuda/CUDACachingAllocator.h> +diff --git a/caffe2/operators/accuracy_op.cu b/caffe2/operators/accuracy_op.cu +index 7ad2b09c238a..f06663d71a90 100644 +--- a/caffe2/operators/accuracy_op.cu ++++ b/caffe2/operators/accuracy_op.cu +@@ -3,7 +3,6 @@ + #include "caffe2/utils/GpuAtomics.cuh" + #include "caffe2/utils/math.h" + +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + namespace caffe2 { +diff --git a/caffe2/operators/affine_channel_op.cu b/caffe2/operators/affine_channel_op.cu +index f3d9e22c7e8c..adf4ac55c0fc 100644 +--- a/caffe2/operators/affine_channel_op.cu ++++ b/caffe2/operators/affine_channel_op.cu +@@ -1,6 +1,5 @@ + #include "caffe2/operators/affine_channel_op.h" + +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/arg_ops.cu b/caffe2/operators/arg_ops.cu +index fbefe0774376..7e90d25b836b 100644 +--- a/caffe2/operators/arg_ops.cu ++++ b/caffe2/operators/arg_ops.cu +@@ -2,8 +2,8 @@ + + #include <limits> + +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> 
++#include <cub/cub.cuh> + + #include "caffe2/core/common_gpu.h" + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/batch_moments_op.cu b/caffe2/operators/batch_moments_op.cu +index 65c43200e5bd..4b693b5c04e2 100644 +--- a/caffe2/operators/batch_moments_op.cu ++++ b/caffe2/operators/batch_moments_op.cu +@@ -1,6 +1,5 @@ + #include "caffe2/operators/batch_moments_op.h" + +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/batch_sparse_to_dense_op.cu b/caffe2/operators/batch_sparse_to_dense_op.cu +index 2cb09deb8668..aea2035a5d3d 100644 +--- a/caffe2/operators/batch_sparse_to_dense_op.cu ++++ b/caffe2/operators/batch_sparse_to_dense_op.cu +@@ -1,6 +1,5 @@ + #include "caffe2/operators/batch_sparse_to_dense_op.h" + +-#include <ATen/cuda/cub.cuh> + #include <cub/device/device_scan.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/boolean_mask_ops.cu b/caffe2/operators/boolean_mask_ops.cu +index c87688f51d64..214b7c13ba3c 100644 +--- a/caffe2/operators/boolean_mask_ops.cu ++++ b/caffe2/operators/boolean_mask_ops.cu +@@ -3,7 +3,7 @@ + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/boolean_mask_ops.h" + +-#include <ATen/cuda/cub.cuh> ++#include <cub/cub.cuh> + + namespace caffe2 { + +diff --git a/caffe2/operators/cross_entropy_op.cu b/caffe2/operators/cross_entropy_op.cu +index 95f3ffddbf1f..380e80399fc3 100644 +--- a/caffe2/operators/cross_entropy_op.cu ++++ b/caffe2/operators/cross_entropy_op.cu +@@ -1,5 +1,4 @@ + #include <assert.h> +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/distance_op.cu b/caffe2/operators/distance_op.cu +index d94691d5a9d9..3a8bb337d541 100644 +--- a/caffe2/operators/distance_op.cu ++++ b/caffe2/operators/distance_op.cu +@@ -4,7 +4,6 @@ + #include "caffe2/operators/distance_op.h" + #include "caffe2/utils/conversions.h" + +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + namespace caffe2 { +diff --git a/caffe2/operators/elementwise_div_op.cu b/caffe2/operators/elementwise_div_op.cu +index ca9682326324..42b103a0f110 100644 +--- a/caffe2/operators/elementwise_div_op.cu ++++ b/caffe2/operators/elementwise_div_op.cu +@@ -3,8 +3,8 @@ + #include <algorithm> + #include <functional> + +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> ++#include <cub/cub.cuh> + + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/elementwise_ops_utils.h" +diff --git a/caffe2/operators/elementwise_linear_op.cu b/caffe2/operators/elementwise_linear_op.cu +index c1c45263f34c..cc49115bffc5 100644 +--- a/caffe2/operators/elementwise_linear_op.cu ++++ b/caffe2/operators/elementwise_linear_op.cu +@@ -5,7 +5,6 @@ + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/operator_fallback_gpu.h" + +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + namespace caffe2 { +diff --git a/caffe2/operators/elementwise_mul_op.cu b/caffe2/operators/elementwise_mul_op.cu +index 88c3da00edc3..bdbf760cf95b 100644 +--- a/caffe2/operators/elementwise_mul_op.cu ++++ b/caffe2/operators/elementwise_mul_op.cu +@@ -3,8 +3,8 @@ + #include <algorithm> + #include <functional> + +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> ++#include <cub/cub.cuh> + + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/elementwise_ops_utils.h" +diff --git 
a/caffe2/operators/elementwise_ops.cu b/caffe2/operators/elementwise_ops.cu +index 1ac0426d2ca7..c9ced33cf806 100644 +--- a/caffe2/operators/elementwise_ops.cu ++++ b/caffe2/operators/elementwise_ops.cu +@@ -1,6 +1,5 @@ + #include "caffe2/operators/elementwise_ops.h" + +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_load.cuh> + #include <cub/block/block_reduce.cuh> + #include <cub/device/device_reduce.cuh> +diff --git a/caffe2/operators/find_op.cu b/caffe2/operators/find_op.cu +index 666df335ce42..f8ff2bab1637 100644 +--- a/caffe2/operators/find_op.cu ++++ b/caffe2/operators/find_op.cu +@@ -1,4 +1,3 @@ +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/find_op.h" +diff --git a/caffe2/operators/generate_proposals_op.cu b/caffe2/operators/generate_proposals_op.cu +index 84906a8e8182..cab0ad3d0b88 100644 +--- a/caffe2/operators/generate_proposals_op.cu ++++ b/caffe2/operators/generate_proposals_op.cu +@@ -1,4 +1,4 @@ +-#include <ATen/cuda/cub.cuh> ++#include <cub/cub.cuh> + #include "caffe2/core/context.h" + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/generate_proposals_op.h" +diff --git a/caffe2/operators/normalize_ops.cu b/caffe2/operators/normalize_ops.cu +index 468175df985f..26df05308d88 100644 +--- a/caffe2/operators/normalize_ops.cu ++++ b/caffe2/operators/normalize_ops.cu +@@ -1,6 +1,5 @@ + #include <algorithm> + +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/one_hot_ops.cu b/caffe2/operators/one_hot_ops.cu +index 86f82f78bb82..e521b3dd09df 100644 +--- a/caffe2/operators/one_hot_ops.cu ++++ b/caffe2/operators/one_hot_ops.cu +@@ -1,4 +1,3 @@ +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/pack_segments.cu b/caffe2/operators/pack_segments.cu +index b9ed413d1e7b..7475100fd368 100644 +--- a/caffe2/operators/pack_segments.cu ++++ b/caffe2/operators/pack_segments.cu +@@ -1,4 +1,4 @@ +-#include <ATen/cuda/cub.cuh> ++#include <cub/cub.cuh> + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/pack_segments.h" + +diff --git a/caffe2/operators/prelu_op.cu b/caffe2/operators/prelu_op.cu +index d29882086754..745a393f075f 100644 +--- a/caffe2/operators/prelu_op.cu ++++ b/caffe2/operators/prelu_op.cu +@@ -1,7 +1,6 @@ + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/prelu_op.h" + +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + namespace caffe2 { +diff --git a/caffe2/operators/reduce_front_back_max_ops.cu b/caffe2/operators/reduce_front_back_max_ops.cu +index ba62b2eff671..ae91f8a6da72 100644 +--- a/caffe2/operators/reduce_front_back_max_ops.cu ++++ b/caffe2/operators/reduce_front_back_max_ops.cu +@@ -1,4 +1,3 @@ +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/reduce_front_back_max_ops.h" +diff --git a/caffe2/operators/reduce_front_back_sum_mean_ops.cu b/caffe2/operators/reduce_front_back_sum_mean_ops.cu +index 586c20fe8d8e..476596f08425 100644 +--- a/caffe2/operators/reduce_front_back_sum_mean_ops.cu ++++ b/caffe2/operators/reduce_front_back_sum_mean_ops.cu +@@ -1,4 +1,3 @@ +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/reduce_front_back_sum_mean_ops.h" 
+diff --git a/caffe2/operators/reduction_ops.cu b/caffe2/operators/reduction_ops.cu +index 0d94fab22a7f..ba55a66de588 100644 +--- a/caffe2/operators/reduction_ops.cu ++++ b/caffe2/operators/reduction_ops.cu +@@ -2,7 +2,7 @@ + #include "caffe2/operators/reduction_ops.h" + #include "caffe2/utils/conversions.h" + +-#include <ATen/cuda/cub.cuh> ++#include <cub/cub.cuh> + + namespace caffe2 { + +diff --git a/caffe2/operators/rmac_regions_op.cu b/caffe2/operators/rmac_regions_op.cu +index 39cc5fbc988d..76c4d012d71a 100644 +--- a/caffe2/operators/rmac_regions_op.cu ++++ b/caffe2/operators/rmac_regions_op.cu +@@ -1,4 +1,3 @@ +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +@@ -11,9 +10,6 @@ + #if defined(USE_ROCM) + namespace rocprim { + #else +-#if CUB_SUPPORTS_WRAPPED_NAMESPACE() +-namespace at_cuda_detail { +-#endif + namespace cub { + #endif + +@@ -26,9 +22,6 @@ inline __host__ __device__ bool operator<( + } + + } // namespace cub +-#if CUB_SUPPORTS_WRAPPED_NAMESPACE() +-} // namespace at_cuda_detail +-#endif + + namespace caffe2 { + +diff --git a/caffe2/operators/segment_reduction_op_gpu.cuh b/caffe2/operators/segment_reduction_op_gpu.cuh +index 447617c6e9de..8d51196ee138 100644 +--- a/caffe2/operators/segment_reduction_op_gpu.cuh ++++ b/caffe2/operators/segment_reduction_op_gpu.cuh +@@ -1,4 +1,3 @@ +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + #include <cub/device/device_reduce.cuh> + #include <cub/device/device_scan.cuh> +diff --git a/caffe2/operators/sequence_ops.cu b/caffe2/operators/sequence_ops.cu +index e66d491f85e6..cc34effd3f22 100644 +--- a/caffe2/operators/sequence_ops.cu ++++ b/caffe2/operators/sequence_ops.cu +@@ -1,6 +1,6 @@ + #include <algorithm> + +-#include <ATen/cuda/cub.cuh> ++#include <cub/cub.cuh> + + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/sequence_ops.h" +diff --git a/caffe2/operators/softmax_ops.cu b/caffe2/operators/softmax_ops.cu +index c01fcf3e0a48..51c0cbc2bf6a 100644 +--- a/caffe2/operators/softmax_ops.cu ++++ b/caffe2/operators/softmax_ops.cu +@@ -1,5 +1,4 @@ + #include <cfloat> +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/spatial_batch_norm_op_impl.cuh b/caffe2/operators/spatial_batch_norm_op_impl.cuh +index f9b9fb58adc8..edc076c7d718 100644 +--- a/caffe2/operators/spatial_batch_norm_op_impl.cuh ++++ b/caffe2/operators/spatial_batch_norm_op_impl.cuh +@@ -5,8 +5,8 @@ + + #include <limits> + +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> ++#include <cub/cub.cuh> + + #include "caffe2/core/context_gpu.h" + #include "caffe2/utils/math.h" +diff --git a/caffe2/sgd/adagrad_fused_op_gpu.cu b/caffe2/sgd/adagrad_fused_op_gpu.cu +index 396da5195125..2c2ad2cf76ae 100644 +--- a/caffe2/sgd/adagrad_fused_op_gpu.cu ++++ b/caffe2/sgd/adagrad_fused_op_gpu.cu +@@ -2,7 +2,6 @@ + #include <c10/core/GeneratorImpl.h> + #include <algorithm> + +-#include <ATen/cuda/cub.cuh> + #include <cub/device/device_radix_sort.cuh> + #include "caffe2/sgd/adagrad_fused_op_gpu.cuh" + #include "caffe2/utils/math.h" +diff --git a/caffe2/sgd/adagrad_op_gpu.cu b/caffe2/sgd/adagrad_op_gpu.cu +index a6fa842ddc80..8abb3376ca87 100644 +--- a/caffe2/sgd/adagrad_op_gpu.cu ++++ b/caffe2/sgd/adagrad_op_gpu.cu +@@ -1,6 +1,5 @@ + #include <algorithm> + +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/common_gpu.h" + #include 
"caffe2/core/context_gpu.h" +diff --git a/caffe2/sgd/adam_op_gpu.cu b/caffe2/sgd/adam_op_gpu.cu +index 4b59836b6a68..42ab975faacb 100644 +--- a/caffe2/sgd/adam_op_gpu.cu ++++ b/caffe2/sgd/adam_op_gpu.cu +@@ -1,4 +1,3 @@ +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/common_gpu.h" + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/utils/math/reduce.cu b/caffe2/utils/math/reduce.cu +index 20919334da50..fc3e476b288b 100644 +--- a/caffe2/utils/math/reduce.cu ++++ b/caffe2/utils/math/reduce.cu +@@ -6,8 +6,8 @@ + #include <numeric> + #include <vector> + +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> ++#include <cub/cub.cuh> + + #include <thrust/execution_policy.h> + #include <thrust/reduce.h> +diff --git a/caffe2/utils/math/reduce.cuh b/caffe2/utils/math/reduce.cuh +index 39ad553eec76..0c43ad45a379 100644 +--- a/caffe2/utils/math/reduce.cuh ++++ b/caffe2/utils/math/reduce.cuh +@@ -1,8 +1,8 @@ + #ifndef CAFFE2_UTILS_MATH_REDUCE_CUH_ + #define CAFFE2_UTILS_MATH_REDUCE_CUH_ + +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> ++#include <cub/cub.cuh> + + #include "caffe2/core/common_gpu.h" + +diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu +index b0a44fed34fb..a37d4b744d73 100644 +--- a/caffe2/utils/math_gpu.cu ++++ b/caffe2/utils/math_gpu.cu +@@ -7,8 +7,8 @@ + #include <numeric> + #include <vector> + +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> ++#include <cub/cub.cuh> + + #include <thrust/host_vector.h> + #include <thrust/device_vector.h> + +From a9ca6d97ff78a4a7a6a6dca90b21ad24666bf6fe Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Mon, 11 Oct 2021 10:27:36 -0700 +Subject: [PATCH 08/30] save + +--- + aten/src/ATen/cuda/cub.cuh | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh +index 39938efc48be..23e90a81a16c 100644 +--- a/aten/src/ATen/cuda/cub.cuh ++++ b/aten/src/ATen/cuda/cub.cuh +@@ -9,6 +9,7 @@ + + #if CUB_SUPPORTS_WRAPPED_NAMESPACE() + ++#define CUB_WRAPPED_NAMESPACE at_cuda_detail + #include <cub/cub.cuh> + + #else + +From 3695222a89a011cf58dfdb907515ffc72231f3af Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Mon, 11 Oct 2021 10:51:25 -0700 +Subject: [PATCH 09/30] fix + +--- + aten/src/ATen/cuda/cub.cuh | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh +index 23e90a81a16c..9014d4b7d2ec 100644 +--- a/aten/src/ATen/cuda/cub.cuh ++++ b/aten/src/ATen/cuda/cub.cuh +@@ -67,12 +67,11 @@ template <> struct cub::NumericTraits<c10::BFloat16>: cub::BaseTraits<cub::FLOAT + } + #endif + ++#if !defined(USE_ROCM) + namespace at { namespace native { + namespace cub = at_cuda_detail::cub; + }} +-namespace caffe2 { +-namespace cub = at_cuda_detail::cub; +-} ++#endif + + namespace at { + namespace cuda { + +From 4d94df2cf61598bb6078364a0f628cb0321cdff5 Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Tue, 12 Oct 2021 07:54:47 -0700 +Subject: [PATCH 10/30] Revert "revert caffe2 change" + +This reverts commit 02e9ca26bdc658edd1d960dc5d31e277b9b9afc1. 
+--- + caffe2/core/context_gpu.cu | 1 + + caffe2/operators/accuracy_op.cu | 1 + + caffe2/operators/affine_channel_op.cu | 1 + + caffe2/operators/arg_ops.cu | 2 +- + caffe2/operators/batch_moments_op.cu | 1 + + caffe2/operators/batch_sparse_to_dense_op.cu | 1 + + caffe2/operators/boolean_mask_ops.cu | 2 +- + caffe2/operators/cross_entropy_op.cu | 1 + + caffe2/operators/distance_op.cu | 1 + + caffe2/operators/elementwise_div_op.cu | 2 +- + caffe2/operators/elementwise_linear_op.cu | 1 + + caffe2/operators/elementwise_mul_op.cu | 2 +- + caffe2/operators/elementwise_ops.cu | 1 + + caffe2/operators/find_op.cu | 1 + + caffe2/operators/generate_proposals_op.cu | 2 +- + caffe2/operators/normalize_ops.cu | 1 + + caffe2/operators/one_hot_ops.cu | 1 + + caffe2/operators/pack_segments.cu | 2 +- + caffe2/operators/prelu_op.cu | 1 + + caffe2/operators/reduce_front_back_max_ops.cu | 1 + + caffe2/operators/reduce_front_back_sum_mean_ops.cu | 1 + + caffe2/operators/reduction_ops.cu | 2 +- + caffe2/operators/rmac_regions_op.cu | 7 +++++++ + caffe2/operators/segment_reduction_op_gpu.cuh | 1 + + caffe2/operators/sequence_ops.cu | 2 +- + caffe2/operators/softmax_ops.cu | 1 + + caffe2/operators/spatial_batch_norm_op_impl.cuh | 2 +- + caffe2/sgd/adagrad_fused_op_gpu.cu | 1 + + caffe2/sgd/adagrad_op_gpu.cu | 1 + + caffe2/sgd/adam_op_gpu.cu | 1 + + caffe2/utils/math/reduce.cu | 2 +- + caffe2/utils/math/reduce.cuh | 2 +- + caffe2/utils/math_gpu.cu | 2 +- + 33 files changed, 39 insertions(+), 12 deletions(-) + +diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu +index c2b89945ada9..475ed61ab4f7 100644 +--- a/caffe2/core/context_gpu.cu ++++ b/caffe2/core/context_gpu.cu +@@ -4,6 +4,7 @@ + #include <string> + #include <unordered_map> + ++#include <ATen/cuda/cub.cuh> + #include <ATen/Context.h> + #include <c10/cuda/CUDAFunctions.h> + #include <c10/cuda/CUDACachingAllocator.h> +diff --git a/caffe2/operators/accuracy_op.cu b/caffe2/operators/accuracy_op.cu +index f06663d71a90..7ad2b09c238a 100644 +--- a/caffe2/operators/accuracy_op.cu ++++ b/caffe2/operators/accuracy_op.cu +@@ -3,6 +3,7 @@ + #include "caffe2/utils/GpuAtomics.cuh" + #include "caffe2/utils/math.h" + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + namespace caffe2 { +diff --git a/caffe2/operators/affine_channel_op.cu b/caffe2/operators/affine_channel_op.cu +index adf4ac55c0fc..f3d9e22c7e8c 100644 +--- a/caffe2/operators/affine_channel_op.cu ++++ b/caffe2/operators/affine_channel_op.cu +@@ -1,5 +1,6 @@ + #include "caffe2/operators/affine_channel_op.h" + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/arg_ops.cu b/caffe2/operators/arg_ops.cu +index 7e90d25b836b..fbefe0774376 100644 +--- a/caffe2/operators/arg_ops.cu ++++ b/caffe2/operators/arg_ops.cu +@@ -2,8 +2,8 @@ + + #include <limits> + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> +-#include <cub/cub.cuh> + + #include "caffe2/core/common_gpu.h" + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/batch_moments_op.cu b/caffe2/operators/batch_moments_op.cu +index 4b693b5c04e2..65c43200e5bd 100644 +--- a/caffe2/operators/batch_moments_op.cu ++++ b/caffe2/operators/batch_moments_op.cu +@@ -1,5 +1,6 @@ + #include "caffe2/operators/batch_moments_op.h" + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/batch_sparse_to_dense_op.cu 
b/caffe2/operators/batch_sparse_to_dense_op.cu +index aea2035a5d3d..2cb09deb8668 100644 +--- a/caffe2/operators/batch_sparse_to_dense_op.cu ++++ b/caffe2/operators/batch_sparse_to_dense_op.cu +@@ -1,5 +1,6 @@ + #include "caffe2/operators/batch_sparse_to_dense_op.h" + ++#include <ATen/cuda/cub.cuh> + #include <cub/device/device_scan.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/boolean_mask_ops.cu b/caffe2/operators/boolean_mask_ops.cu +index 214b7c13ba3c..c87688f51d64 100644 +--- a/caffe2/operators/boolean_mask_ops.cu ++++ b/caffe2/operators/boolean_mask_ops.cu +@@ -3,7 +3,7 @@ + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/boolean_mask_ops.h" + +-#include <cub/cub.cuh> ++#include <ATen/cuda/cub.cuh> + + namespace caffe2 { + +diff --git a/caffe2/operators/cross_entropy_op.cu b/caffe2/operators/cross_entropy_op.cu +index 380e80399fc3..95f3ffddbf1f 100644 +--- a/caffe2/operators/cross_entropy_op.cu ++++ b/caffe2/operators/cross_entropy_op.cu +@@ -1,4 +1,5 @@ + #include <assert.h> ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/distance_op.cu b/caffe2/operators/distance_op.cu +index 3a8bb337d541..d94691d5a9d9 100644 +--- a/caffe2/operators/distance_op.cu ++++ b/caffe2/operators/distance_op.cu +@@ -4,6 +4,7 @@ + #include "caffe2/operators/distance_op.h" + #include "caffe2/utils/conversions.h" + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + namespace caffe2 { +diff --git a/caffe2/operators/elementwise_div_op.cu b/caffe2/operators/elementwise_div_op.cu +index 42b103a0f110..ca9682326324 100644 +--- a/caffe2/operators/elementwise_div_op.cu ++++ b/caffe2/operators/elementwise_div_op.cu +@@ -3,8 +3,8 @@ + #include <algorithm> + #include <functional> + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> +-#include <cub/cub.cuh> + + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/elementwise_ops_utils.h" +diff --git a/caffe2/operators/elementwise_linear_op.cu b/caffe2/operators/elementwise_linear_op.cu +index cc49115bffc5..c1c45263f34c 100644 +--- a/caffe2/operators/elementwise_linear_op.cu ++++ b/caffe2/operators/elementwise_linear_op.cu +@@ -5,6 +5,7 @@ + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/operator_fallback_gpu.h" + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + namespace caffe2 { +diff --git a/caffe2/operators/elementwise_mul_op.cu b/caffe2/operators/elementwise_mul_op.cu +index bdbf760cf95b..88c3da00edc3 100644 +--- a/caffe2/operators/elementwise_mul_op.cu ++++ b/caffe2/operators/elementwise_mul_op.cu +@@ -3,8 +3,8 @@ + #include <algorithm> + #include <functional> + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> +-#include <cub/cub.cuh> + + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/elementwise_ops_utils.h" +diff --git a/caffe2/operators/elementwise_ops.cu b/caffe2/operators/elementwise_ops.cu +index c9ced33cf806..1ac0426d2ca7 100644 +--- a/caffe2/operators/elementwise_ops.cu ++++ b/caffe2/operators/elementwise_ops.cu +@@ -1,5 +1,6 @@ + #include "caffe2/operators/elementwise_ops.h" + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_load.cuh> + #include <cub/block/block_reduce.cuh> + #include <cub/device/device_reduce.cuh> +diff --git a/caffe2/operators/find_op.cu b/caffe2/operators/find_op.cu +index f8ff2bab1637..666df335ce42 100644 +--- a/caffe2/operators/find_op.cu ++++ 
b/caffe2/operators/find_op.cu +@@ -1,3 +1,4 @@ ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/find_op.h" +diff --git a/caffe2/operators/generate_proposals_op.cu b/caffe2/operators/generate_proposals_op.cu +index cab0ad3d0b88..84906a8e8182 100644 +--- a/caffe2/operators/generate_proposals_op.cu ++++ b/caffe2/operators/generate_proposals_op.cu +@@ -1,4 +1,4 @@ +-#include <cub/cub.cuh> ++#include <ATen/cuda/cub.cuh> + #include "caffe2/core/context.h" + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/generate_proposals_op.h" +diff --git a/caffe2/operators/normalize_ops.cu b/caffe2/operators/normalize_ops.cu +index 26df05308d88..468175df985f 100644 +--- a/caffe2/operators/normalize_ops.cu ++++ b/caffe2/operators/normalize_ops.cu +@@ -1,5 +1,6 @@ + #include <algorithm> + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/one_hot_ops.cu b/caffe2/operators/one_hot_ops.cu +index e521b3dd09df..86f82f78bb82 100644 +--- a/caffe2/operators/one_hot_ops.cu ++++ b/caffe2/operators/one_hot_ops.cu +@@ -1,3 +1,4 @@ ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/pack_segments.cu b/caffe2/operators/pack_segments.cu +index 7475100fd368..b9ed413d1e7b 100644 +--- a/caffe2/operators/pack_segments.cu ++++ b/caffe2/operators/pack_segments.cu +@@ -1,4 +1,4 @@ +-#include <cub/cub.cuh> ++#include <ATen/cuda/cub.cuh> + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/pack_segments.h" + +diff --git a/caffe2/operators/prelu_op.cu b/caffe2/operators/prelu_op.cu +index 745a393f075f..d29882086754 100644 +--- a/caffe2/operators/prelu_op.cu ++++ b/caffe2/operators/prelu_op.cu +@@ -1,6 +1,7 @@ + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/prelu_op.h" + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + namespace caffe2 { +diff --git a/caffe2/operators/reduce_front_back_max_ops.cu b/caffe2/operators/reduce_front_back_max_ops.cu +index ae91f8a6da72..ba62b2eff671 100644 +--- a/caffe2/operators/reduce_front_back_max_ops.cu ++++ b/caffe2/operators/reduce_front_back_max_ops.cu +@@ -1,3 +1,4 @@ ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/reduce_front_back_max_ops.h" +diff --git a/caffe2/operators/reduce_front_back_sum_mean_ops.cu b/caffe2/operators/reduce_front_back_sum_mean_ops.cu +index 476596f08425..586c20fe8d8e 100644 +--- a/caffe2/operators/reduce_front_back_sum_mean_ops.cu ++++ b/caffe2/operators/reduce_front_back_sum_mean_ops.cu +@@ -1,3 +1,4 @@ ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/reduce_front_back_sum_mean_ops.h" +diff --git a/caffe2/operators/reduction_ops.cu b/caffe2/operators/reduction_ops.cu +index ba55a66de588..0d94fab22a7f 100644 +--- a/caffe2/operators/reduction_ops.cu ++++ b/caffe2/operators/reduction_ops.cu +@@ -2,7 +2,7 @@ + #include "caffe2/operators/reduction_ops.h" + #include "caffe2/utils/conversions.h" + +-#include <cub/cub.cuh> ++#include <ATen/cuda/cub.cuh> + + namespace caffe2 { + +diff --git a/caffe2/operators/rmac_regions_op.cu b/caffe2/operators/rmac_regions_op.cu +index 76c4d012d71a..39cc5fbc988d 100644 +--- a/caffe2/operators/rmac_regions_op.cu ++++ 
b/caffe2/operators/rmac_regions_op.cu +@@ -1,3 +1,4 @@ ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +@@ -10,6 +11,9 @@ + #if defined(USE_ROCM) + namespace rocprim { + #else ++#if CUB_SUPPORTS_WRAPPED_NAMESPACE() ++namespace at_cuda_detail { ++#endif + namespace cub { + #endif + +@@ -22,6 +26,9 @@ inline __host__ __device__ bool operator<( + } + + } // namespace cub ++#if CUB_SUPPORTS_WRAPPED_NAMESPACE() ++} // namespace at_cuda_detail ++#endif + + namespace caffe2 { + +diff --git a/caffe2/operators/segment_reduction_op_gpu.cuh b/caffe2/operators/segment_reduction_op_gpu.cuh +index 8d51196ee138..447617c6e9de 100644 +--- a/caffe2/operators/segment_reduction_op_gpu.cuh ++++ b/caffe2/operators/segment_reduction_op_gpu.cuh +@@ -1,3 +1,4 @@ ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + #include <cub/device/device_reduce.cuh> + #include <cub/device/device_scan.cuh> +diff --git a/caffe2/operators/sequence_ops.cu b/caffe2/operators/sequence_ops.cu +index cc34effd3f22..e66d491f85e6 100644 +--- a/caffe2/operators/sequence_ops.cu ++++ b/caffe2/operators/sequence_ops.cu +@@ -1,6 +1,6 @@ + #include <algorithm> + +-#include <cub/cub.cuh> ++#include <ATen/cuda/cub.cuh> + + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/sequence_ops.h" +diff --git a/caffe2/operators/softmax_ops.cu b/caffe2/operators/softmax_ops.cu +index 51c0cbc2bf6a..c01fcf3e0a48 100644 +--- a/caffe2/operators/softmax_ops.cu ++++ b/caffe2/operators/softmax_ops.cu +@@ -1,4 +1,5 @@ + #include <cfloat> ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/spatial_batch_norm_op_impl.cuh b/caffe2/operators/spatial_batch_norm_op_impl.cuh +index edc076c7d718..f9b9fb58adc8 100644 +--- a/caffe2/operators/spatial_batch_norm_op_impl.cuh ++++ b/caffe2/operators/spatial_batch_norm_op_impl.cuh +@@ -5,8 +5,8 @@ + + #include <limits> + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> +-#include <cub/cub.cuh> + + #include "caffe2/core/context_gpu.h" + #include "caffe2/utils/math.h" +diff --git a/caffe2/sgd/adagrad_fused_op_gpu.cu b/caffe2/sgd/adagrad_fused_op_gpu.cu +index 2c2ad2cf76ae..396da5195125 100644 +--- a/caffe2/sgd/adagrad_fused_op_gpu.cu ++++ b/caffe2/sgd/adagrad_fused_op_gpu.cu +@@ -2,6 +2,7 @@ + #include <c10/core/GeneratorImpl.h> + #include <algorithm> + ++#include <ATen/cuda/cub.cuh> + #include <cub/device/device_radix_sort.cuh> + #include "caffe2/sgd/adagrad_fused_op_gpu.cuh" + #include "caffe2/utils/math.h" +diff --git a/caffe2/sgd/adagrad_op_gpu.cu b/caffe2/sgd/adagrad_op_gpu.cu +index 8abb3376ca87..a6fa842ddc80 100644 +--- a/caffe2/sgd/adagrad_op_gpu.cu ++++ b/caffe2/sgd/adagrad_op_gpu.cu +@@ -1,5 +1,6 @@ + #include <algorithm> + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/common_gpu.h" + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/sgd/adam_op_gpu.cu b/caffe2/sgd/adam_op_gpu.cu +index 42ab975faacb..4b59836b6a68 100644 +--- a/caffe2/sgd/adam_op_gpu.cu ++++ b/caffe2/sgd/adam_op_gpu.cu +@@ -1,3 +1,4 @@ ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/common_gpu.h" + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/utils/math/reduce.cu b/caffe2/utils/math/reduce.cu +index fc3e476b288b..20919334da50 100644 +--- a/caffe2/utils/math/reduce.cu ++++ b/caffe2/utils/math/reduce.cu +@@ -6,8 +6,8 @@ + #include 
<numeric> + #include <vector> + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> +-#include <cub/cub.cuh> + + #include <thrust/execution_policy.h> + #include <thrust/reduce.h> +diff --git a/caffe2/utils/math/reduce.cuh b/caffe2/utils/math/reduce.cuh +index 0c43ad45a379..39ad553eec76 100644 +--- a/caffe2/utils/math/reduce.cuh ++++ b/caffe2/utils/math/reduce.cuh +@@ -1,8 +1,8 @@ + #ifndef CAFFE2_UTILS_MATH_REDUCE_CUH_ + #define CAFFE2_UTILS_MATH_REDUCE_CUH_ + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> +-#include <cub/cub.cuh> + + #include "caffe2/core/common_gpu.h" + +diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu +index a37d4b744d73..b0a44fed34fb 100644 +--- a/caffe2/utils/math_gpu.cu ++++ b/caffe2/utils/math_gpu.cu +@@ -7,8 +7,8 @@ + #include <numeric> + #include <vector> + ++#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> +-#include <cub/cub.cuh> + + #include <thrust/host_vector.h> + #include <thrust/device_vector.h> + +From 359666307907780e946284445e609131652a4739 Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Tue, 12 Oct 2021 08:04:01 -0700 +Subject: [PATCH 11/30] save + +--- + aten/src/ATen/cuda/cub.cuh | 4 +++- + aten/src/ATen/cuda/cub_definitions.cuh | 5 ++++- + cmake/Dependencies.cmake | 4 ++++ + 3 files changed, 11 insertions(+), 2 deletions(-) + +diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh +index 9014d4b7d2ec..4f39bde4b3c8 100644 +--- a/aten/src/ATen/cuda/cub.cuh ++++ b/aten/src/ATen/cuda/cub.cuh +@@ -9,7 +9,6 @@ + + #if CUB_SUPPORTS_WRAPPED_NAMESPACE() + +-#define CUB_WRAPPED_NAMESPACE at_cuda_detail + #include <cub/cub.cuh> + + #else +@@ -71,6 +70,9 @@ template <> struct cub::NumericTraits<c10::BFloat16>: cub::BaseTraits<cub::FLOAT + namespace at { namespace native { + namespace cub = at_cuda_detail::cub; + }} ++namespace caffew { ++namespace cub = at_cuda_detail::cub; ++} + #endif + + namespace at { +diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh +index 7da3ab716e41..b4c82c9ae975 100644 +--- a/aten/src/ATen/cuda/cub_definitions.cuh ++++ b/aten/src/ATen/cuda/cub_definitions.cuh +@@ -1,6 +1,7 @@ + #pragma once + + #if !defined(USE_ROCM) ++#include <cuda.h> + #include <cub/version.cuh> + #else + #define CUB_VERSION 0 +@@ -16,7 +17,9 @@ + + // cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.14 in: + // https://github.com/NVIDIA/cub/pull/326 +-#if CUB_VERSION >= 101400 ++// CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake ++// starting from CUDA 11.4 ++#if defined(CUDA_VERSION) && CUDA_VERSION >= 11040 + #define CUB_SUPPORTS_WRAPPED_NAMESPACE() true + #else + #define CUB_SUPPORTS_WRAPPED_NAMESPACE() false +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index 564fcebc0b08..bfee8f6dc78a 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1618,6 +1618,10 @@ if(NOT INTERN_BUILD_MOBILE) + set(CMAKE_CXX_STANDARD 14) + endif() + ++ if(NOT ${CUDA_VERSION} LESS 11.4) ++ string(APPEND CMAKE_CUDA_FLAGS " -DCUB_WRAPPED_NAMESPACE=at_cuda_detail") ++ endif() ++ + if(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) + message(STATUS "Found CUDA with FP16 support, compiling with torch.cuda.HalfTensor") + string(APPEND CMAKE_CUDA_FLAGS " -DCUDA_HAS_FP16=1" + +From d4b3679d45cadbf9123a869cfefa001832bce04a Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Tue, 12 Oct 2021 08:09:00 -0700 +Subject: [PATCH 12/30] save + +--- + 
aten/src/ATen/cuda/cub.cuh | 2 +- + aten/src/ATen/cuda/cub_definitions.cuh | 4 ++-- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh +index 4f39bde4b3c8..2583b7e640ec 100644 +--- a/aten/src/ATen/cuda/cub.cuh ++++ b/aten/src/ATen/cuda/cub.cuh +@@ -7,7 +7,7 @@ + + #include <ATen/cuda/cub_definitions.cuh> + +-#if CUB_SUPPORTS_WRAPPED_NAMESPACE() ++#if USE_GLOBAL_CUB_WRAPPED_NAMESPACE() + + #include <cub/cub.cuh> + +diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh +index b4c82c9ae975..160d6f157396 100644 +--- a/aten/src/ATen/cuda/cub_definitions.cuh ++++ b/aten/src/ATen/cuda/cub_definitions.cuh +@@ -20,7 +20,7 @@ + // CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake + // starting from CUDA 11.4 + #if defined(CUDA_VERSION) && CUDA_VERSION >= 11040 +-#define CUB_SUPPORTS_WRAPPED_NAMESPACE() true ++#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() true + #else +-#define CUB_SUPPORTS_WRAPPED_NAMESPACE() false ++#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false + #endif + +From 2b710dab756fceadf393d1c38cec91ef000e8fa8 Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Tue, 12 Oct 2021 08:23:39 -0700 +Subject: [PATCH 13/30] fix + +--- + aten/src/ATen/cuda/cub.cuh | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh +index 2583b7e640ec..042009f3d8b4 100644 +--- a/aten/src/ATen/cuda/cub.cuh ++++ b/aten/src/ATen/cuda/cub.cuh +@@ -68,10 +68,10 @@ template <> struct cub::NumericTraits<c10::BFloat16>: cub::BaseTraits<cub::FLOAT + + #if !defined(USE_ROCM) + namespace at { namespace native { +-namespace cub = at_cuda_detail::cub; ++namespace cub = ::at_cuda_detail::cub; + }} +-namespace caffew { +-namespace cub = at_cuda_detail::cub; ++namespace caffe2 { ++namespace cub = ::at_cuda_detail::cub; + } + #endif + + +From 34c57ca996c3249fe23e7037f98d1a9638187371 Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Tue, 12 Oct 2021 08:31:35 -0700 +Subject: [PATCH 14/30] fix + +--- + caffe2/operators/rmac_regions_op.cu | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/caffe2/operators/rmac_regions_op.cu b/caffe2/operators/rmac_regions_op.cu +index 39cc5fbc988d..2f6c230574b0 100644 +--- a/caffe2/operators/rmac_regions_op.cu ++++ b/caffe2/operators/rmac_regions_op.cu +@@ -11,7 +11,7 @@ + #if defined(USE_ROCM) + namespace rocprim { + #else +-#if CUB_SUPPORTS_WRAPPED_NAMESPACE() ++#if USE_GLOBAL_CUB_WRAPPED_NAMESPACE() + namespace at_cuda_detail { + #endif + namespace cub { +@@ -26,7 +26,7 @@ inline __host__ __device__ bool operator<( + } + + } // namespace cub +-#if CUB_SUPPORTS_WRAPPED_NAMESPACE() ++#if USE_GLOBAL_CUB_WRAPPED_NAMESPACE() + } // namespace at_cuda_detail + #endif + + +From 3f6bce369e20fc75160c8151064e5cf3c4d871ce Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Tue, 12 Oct 2021 08:56:27 -0700 +Subject: [PATCH 16/30] save + +--- + aten/src/ATen/cuda/cub_definitions.cuh | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh +index 160d6f157396..7746885d6428 100644 +--- a/aten/src/ATen/cuda/cub_definitions.cuh ++++ b/aten/src/ATen/cuda/cub_definitions.cuh +@@ -15,11 +15,11 @@ + #define CUB_SUPPORTS_NV_BFLOAT16() false + #endif + +-// cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.14 in: ++// cub sort support 
for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in: + // https://github.com/NVIDIA/cub/pull/326 + // CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake + // starting from CUDA 11.4 +-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11040 ++#if CUB_VERSION >= 101301 + #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() true + #else + #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false + +From a2346ec6c5804085e49e81007ed70fd1aae0f333 Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Tue, 12 Oct 2021 09:04:04 -0700 +Subject: [PATCH 17/30] save + +--- + aten/src/ATen/cuda/cub.cuh | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh +index 042009f3d8b4..17062586b0e3 100644 +--- a/aten/src/ATen/cuda/cub.cuh ++++ b/aten/src/ATen/cuda/cub.cuh +@@ -17,9 +17,8 @@ + #undef CUB_NS_PREFIX + #define CUB_NS_PREFIX namespace at_cuda_detail { + #define CUB_NS_POSTFIX } ++#define CUB_NS_QUALIFIER ::at_cuda_detail::cub + #include <cub/cub.cuh> +-#undef CUB_NS_POSTFIX +-#undef CUB_NS_PREFIX + + #endif + + +From add451ed3a579a794609bb26e82b31cec8527d03 Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Tue, 12 Oct 2021 09:06:53 -0700 +Subject: [PATCH 18/30] save + +--- + aten/src/ATen/cuda/cub_definitions.cuh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh +index 7746885d6428..fb43441148fc 100644 +--- a/aten/src/ATen/cuda/cub_definitions.cuh ++++ b/aten/src/ATen/cuda/cub_definitions.cuh +@@ -19,7 +19,7 @@ + // https://github.com/NVIDIA/cub/pull/326 + // CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake + // starting from CUDA 11.4 +-#if CUB_VERSION >= 101301 ++#if CUB_VERSION >= 101400 + #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() true + #else + #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false + +From d1113be1236633978a51581ad9270069ddea5c30 Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Tue, 12 Oct 2021 09:07:12 -0700 +Subject: [PATCH 19/30] save + +--- + aten/src/ATen/cuda/cub_definitions.cuh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh +index fb43441148fc..4b339b92c6dc 100644 +--- a/aten/src/ATen/cuda/cub_definitions.cuh ++++ b/aten/src/ATen/cuda/cub_definitions.cuh +@@ -18,7 +18,7 @@ + // cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in: + // https://github.com/NVIDIA/cub/pull/326 + // CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake +-// starting from CUDA 11.4 ++// starting from CUDA 11.5 + #if CUB_VERSION >= 101400 + #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() true + #else + +From 59949c2c4945f7df89888bbbfb44060ea19e31ff Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Tue, 12 Oct 2021 11:45:18 -0700 +Subject: [PATCH 20/30] save + +--- + aten/src/ATen/cuda/cub.cuh | 8 ++++---- + aten/src/ATen/cuda/cub_definitions.cuh | 8 ++++---- + caffe2/core/context_gpu.cu | 2 +- + caffe2/operators/accuracy_op.cu | 2 +- + caffe2/operators/affine_channel_op.cu | 2 +- + caffe2/operators/arg_ops.cu | 2 +- + caffe2/operators/batch_moments_op.cu | 2 +- + caffe2/operators/batch_sparse_to_dense_op.cu | 2 +- + caffe2/operators/boolean_mask_ops.cu | 4 ++-- + caffe2/operators/cross_entropy_op.cu | 2 +- + caffe2/operators/distance_op.cu | 2 +- + caffe2/operators/elementwise_div_op.cu | 2 +- + 
caffe2/operators/elementwise_linear_op.cu | 2 +- + caffe2/operators/elementwise_mul_op.cu | 2 +- + caffe2/operators/elementwise_ops.cu | 2 +- + caffe2/operators/find_op.cu | 2 +- + caffe2/operators/generate_proposals_op.cu | 3 ++- + caffe2/operators/normalize_ops.cu | 2 +- + caffe2/operators/one_hot_ops.cu | 2 +- + caffe2/operators/pack_segments.cu | 3 ++- + caffe2/operators/prelu_op.cu | 2 +- + caffe2/operators/reduce_front_back_max_ops.cu | 2 +- + caffe2/operators/reduce_front_back_sum_mean_ops.cu | 2 +- + caffe2/operators/reduction_ops.cu | 2 +- + caffe2/operators/rmac_regions_op.cu | 2 +- + caffe2/operators/segment_reduction_op_gpu.cuh | 2 +- + caffe2/operators/sequence_ops.cu | 3 ++- + caffe2/operators/softmax_ops.cu | 2 +- + caffe2/operators/spatial_batch_norm_op_impl.cuh | 2 +- + caffe2/sgd/adagrad_fused_op_gpu.cu | 2 +- + caffe2/sgd/adagrad_op_gpu.cu | 2 +- + caffe2/sgd/adam_op_gpu.cu | 2 +- + caffe2/utils/cub_namespace.cuh | 7 +++++++ + caffe2/utils/math/reduce.cu | 3 +-- + caffe2/utils/math/reduce.cuh | 2 +- + caffe2/utils/math_gpu.cu | 3 ++- + cmake/Dependencies.cmake | 2 +- + 37 files changed, 54 insertions(+), 44 deletions(-) + create mode 100644 caffe2/utils/cub_namespace.cuh + +diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh +index 17062586b0e3..de14455adb98 100644 +--- a/aten/src/ATen/cuda/cub.cuh ++++ b/aten/src/ATen/cuda/cub.cuh +@@ -15,11 +15,14 @@ + + #undef CUB_NS_POSTFIX //undef to avoid redefinition warnings + #undef CUB_NS_PREFIX ++#undef CUB_NS_QUALIFIER + #define CUB_NS_PREFIX namespace at_cuda_detail { + #define CUB_NS_POSTFIX } + #define CUB_NS_QUALIFIER ::at_cuda_detail::cub + #include <cub/cub.cuh> +- ++#undef CUB_NS_POSTFIX ++#undef CUB_NS_PREFIX ++#undef CUB_NS_QUALIFIER + #endif + + #include <ATen/cuda/Exceptions.h> +@@ -69,9 +72,6 @@ template <> struct cub::NumericTraits<c10::BFloat16>: cub::BaseTraits<cub::FLOAT + namespace at { namespace native { + namespace cub = ::at_cuda_detail::cub; + }} +-namespace caffe2 { +-namespace cub = ::at_cuda_detail::cub; +-} + #endif + + namespace at { +diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh +index 4b339b92c6dc..07cffe9d34e4 100644 +--- a/aten/src/ATen/cuda/cub_definitions.cuh ++++ b/aten/src/ATen/cuda/cub_definitions.cuh +@@ -15,12 +15,12 @@ + #define CUB_SUPPORTS_NV_BFLOAT16() false + #endif + +-// cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in: ++// cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.14 in: + // https://github.com/NVIDIA/cub/pull/326 + // CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake +-// starting from CUDA 11.5 +-#if CUB_VERSION >= 101400 ++// starting from CUDA 11.6 ++#if defined(CUB_WRAPPED_NAMESPACE) || defined(THRUST_CUB_WRAPPED_NAMESPACE) + #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() true + #else + #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false +-#endif ++#endif +\ No newline at end of file +diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu +index 475ed61ab4f7..9ba9f74d5376 100644 +--- a/caffe2/core/context_gpu.cu ++++ b/caffe2/core/context_gpu.cu +@@ -4,7 +4,7 @@ + #include <string> + #include <unordered_map> + +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <ATen/Context.h> + #include <c10/cuda/CUDAFunctions.h> + #include <c10/cuda/CUDACachingAllocator.h> +diff --git a/caffe2/operators/accuracy_op.cu b/caffe2/operators/accuracy_op.cu +index 7ad2b09c238a..29df54e752d3 100644 +--- 
a/caffe2/operators/accuracy_op.cu ++++ b/caffe2/operators/accuracy_op.cu +@@ -3,7 +3,7 @@ + #include "caffe2/utils/GpuAtomics.cuh" + #include "caffe2/utils/math.h" + +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + + namespace caffe2 { +diff --git a/caffe2/operators/affine_channel_op.cu b/caffe2/operators/affine_channel_op.cu +index f3d9e22c7e8c..efae0a3fc695 100644 +--- a/caffe2/operators/affine_channel_op.cu ++++ b/caffe2/operators/affine_channel_op.cu +@@ -1,6 +1,6 @@ + #include "caffe2/operators/affine_channel_op.h" + +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/arg_ops.cu b/caffe2/operators/arg_ops.cu +index fbefe0774376..56deaa636356 100644 +--- a/caffe2/operators/arg_ops.cu ++++ b/caffe2/operators/arg_ops.cu +@@ -2,7 +2,7 @@ + + #include <limits> + +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/common_gpu.h" +diff --git a/caffe2/operators/batch_moments_op.cu b/caffe2/operators/batch_moments_op.cu +index 65c43200e5bd..81359f6440b0 100644 +--- a/caffe2/operators/batch_moments_op.cu ++++ b/caffe2/operators/batch_moments_op.cu +@@ -1,6 +1,6 @@ + #include "caffe2/operators/batch_moments_op.h" + +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/batch_sparse_to_dense_op.cu b/caffe2/operators/batch_sparse_to_dense_op.cu +index 2cb09deb8668..3e7ad8af9a5b 100644 +--- a/caffe2/operators/batch_sparse_to_dense_op.cu ++++ b/caffe2/operators/batch_sparse_to_dense_op.cu +@@ -1,6 +1,6 @@ + #include "caffe2/operators/batch_sparse_to_dense_op.h" + +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/device/device_scan.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/boolean_mask_ops.cu b/caffe2/operators/boolean_mask_ops.cu +index c87688f51d64..501dd3b191c8 100644 +--- a/caffe2/operators/boolean_mask_ops.cu ++++ b/caffe2/operators/boolean_mask_ops.cu +@@ -2,8 +2,8 @@ + + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/boolean_mask_ops.h" +- +-#include <ATen/cuda/cub.cuh> ++#include <cub/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + + namespace caffe2 { + +diff --git a/caffe2/operators/cross_entropy_op.cu b/caffe2/operators/cross_entropy_op.cu +index 95f3ffddbf1f..15cb8a4f574a 100644 +--- a/caffe2/operators/cross_entropy_op.cu ++++ b/caffe2/operators/cross_entropy_op.cu +@@ -1,5 +1,5 @@ + #include <assert.h> +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/distance_op.cu b/caffe2/operators/distance_op.cu +index d94691d5a9d9..a360166854ff 100644 +--- a/caffe2/operators/distance_op.cu ++++ b/caffe2/operators/distance_op.cu +@@ -4,7 +4,7 @@ + #include "caffe2/operators/distance_op.h" + #include "caffe2/utils/conversions.h" + +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + + namespace caffe2 { +diff --git a/caffe2/operators/elementwise_div_op.cu b/caffe2/operators/elementwise_div_op.cu +index ca9682326324..33118a8f5e16 100644 +--- a/caffe2/operators/elementwise_div_op.cu ++++ 
b/caffe2/operators/elementwise_div_op.cu +@@ -3,7 +3,7 @@ + #include <algorithm> + #include <functional> + +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/elementwise_linear_op.cu b/caffe2/operators/elementwise_linear_op.cu +index c1c45263f34c..8f749644b295 100644 +--- a/caffe2/operators/elementwise_linear_op.cu ++++ b/caffe2/operators/elementwise_linear_op.cu +@@ -5,7 +5,7 @@ + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/operator_fallback_gpu.h" + +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + + namespace caffe2 { +diff --git a/caffe2/operators/elementwise_mul_op.cu b/caffe2/operators/elementwise_mul_op.cu +index 88c3da00edc3..1991b8b513af 100644 +--- a/caffe2/operators/elementwise_mul_op.cu ++++ b/caffe2/operators/elementwise_mul_op.cu +@@ -3,7 +3,7 @@ + #include <algorithm> + #include <functional> + +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/elementwise_ops.cu b/caffe2/operators/elementwise_ops.cu +index 1ac0426d2ca7..932bd5dafda0 100644 +--- a/caffe2/operators/elementwise_ops.cu ++++ b/caffe2/operators/elementwise_ops.cu +@@ -1,6 +1,6 @@ + #include "caffe2/operators/elementwise_ops.h" + +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_load.cuh> + #include <cub/block/block_reduce.cuh> + #include <cub/device/device_reduce.cuh> +diff --git a/caffe2/operators/find_op.cu b/caffe2/operators/find_op.cu +index 666df335ce42..20d42560b506 100644 +--- a/caffe2/operators/find_op.cu ++++ b/caffe2/operators/find_op.cu +@@ -1,4 +1,4 @@ +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/find_op.h" +diff --git a/caffe2/operators/generate_proposals_op.cu b/caffe2/operators/generate_proposals_op.cu +index 84906a8e8182..b63726651939 100644 +--- a/caffe2/operators/generate_proposals_op.cu ++++ b/caffe2/operators/generate_proposals_op.cu +@@ -1,10 +1,11 @@ +-#include <ATen/cuda/cub.cuh> ++#include <cub/cub.cuh> + #include "caffe2/core/context.h" + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/generate_proposals_op.h" + #include "caffe2/operators/generate_proposals_op_util_boxes.h" // BBOX_XFORM_CLIP_DEFAULT + #include "caffe2/operators/generate_proposals_op_util_nms.h" + #include "caffe2/operators/generate_proposals_op_util_nms_gpu.h" ++#include "caffe2/utils/cub_namespace.cuh" + + #if defined(USE_ROCM) + #include <cfloat> +diff --git a/caffe2/operators/normalize_ops.cu b/caffe2/operators/normalize_ops.cu +index 468175df985f..952c4a772fa5 100644 +--- a/caffe2/operators/normalize_ops.cu ++++ b/caffe2/operators/normalize_ops.cu +@@ -1,6 +1,6 @@ + #include <algorithm> + +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/one_hot_ops.cu b/caffe2/operators/one_hot_ops.cu +index 86f82f78bb82..4b1e054b0806 100644 +--- a/caffe2/operators/one_hot_ops.cu ++++ b/caffe2/operators/one_hot_ops.cu +@@ -1,4 +1,4 @@ +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include 
<cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/pack_segments.cu b/caffe2/operators/pack_segments.cu +index b9ed413d1e7b..372638abdd24 100644 +--- a/caffe2/operators/pack_segments.cu ++++ b/caffe2/operators/pack_segments.cu +@@ -1,6 +1,7 @@ +-#include <ATen/cuda/cub.cuh> ++#include <cub/cub.cuh> + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/pack_segments.h" ++#include "caffe2/utils/cub_namespace.cuh" + + namespace caffe2 { + +diff --git a/caffe2/operators/prelu_op.cu b/caffe2/operators/prelu_op.cu +index d29882086754..6303b70b4a89 100644 +--- a/caffe2/operators/prelu_op.cu ++++ b/caffe2/operators/prelu_op.cu +@@ -1,7 +1,7 @@ + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/prelu_op.h" + +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + + namespace caffe2 { +diff --git a/caffe2/operators/reduce_front_back_max_ops.cu b/caffe2/operators/reduce_front_back_max_ops.cu +index ba62b2eff671..d6bb862e4dbb 100644 +--- a/caffe2/operators/reduce_front_back_max_ops.cu ++++ b/caffe2/operators/reduce_front_back_max_ops.cu +@@ -1,4 +1,4 @@ +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/reduce_front_back_max_ops.h" +diff --git a/caffe2/operators/reduce_front_back_sum_mean_ops.cu b/caffe2/operators/reduce_front_back_sum_mean_ops.cu +index 586c20fe8d8e..2b5cb7110edf 100644 +--- a/caffe2/operators/reduce_front_back_sum_mean_ops.cu ++++ b/caffe2/operators/reduce_front_back_sum_mean_ops.cu +@@ -1,4 +1,4 @@ +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/reduce_front_back_sum_mean_ops.h" +diff --git a/caffe2/operators/reduction_ops.cu b/caffe2/operators/reduction_ops.cu +index 0d94fab22a7f..9649b85d015c 100644 +--- a/caffe2/operators/reduction_ops.cu ++++ b/caffe2/operators/reduction_ops.cu +@@ -2,7 +2,7 @@ + #include "caffe2/operators/reduction_ops.h" + #include "caffe2/utils/conversions.h" + +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + + namespace caffe2 { + +diff --git a/caffe2/operators/rmac_regions_op.cu b/caffe2/operators/rmac_regions_op.cu +index 2f6c230574b0..6b79e0c4c8b4 100644 +--- a/caffe2/operators/rmac_regions_op.cu ++++ b/caffe2/operators/rmac_regions_op.cu +@@ -1,5 +1,5 @@ +-#include <ATen/cuda/cub.cuh> + #include <cub/block/block_reduce.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/rmac_regions_op.h" +diff --git a/caffe2/operators/segment_reduction_op_gpu.cuh b/caffe2/operators/segment_reduction_op_gpu.cuh +index 447617c6e9de..bb3f3be13c72 100644 +--- a/caffe2/operators/segment_reduction_op_gpu.cuh ++++ b/caffe2/operators/segment_reduction_op_gpu.cuh +@@ -1,4 +1,4 @@ +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + #include <cub/device/device_reduce.cuh> + #include <cub/device/device_scan.cuh> +diff --git a/caffe2/operators/sequence_ops.cu b/caffe2/operators/sequence_ops.cu +index e66d491f85e6..2ceb5236ef72 100644 +--- a/caffe2/operators/sequence_ops.cu ++++ b/caffe2/operators/sequence_ops.cu +@@ -1,6 +1,7 @@ + #include <algorithm> + +-#include <ATen/cuda/cub.cuh> ++#include <cub/cub.cuh> ++#include 
"caffe2/utils/cub_namespace.cuh" + + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/sequence_ops.h" +diff --git a/caffe2/operators/softmax_ops.cu b/caffe2/operators/softmax_ops.cu +index c01fcf3e0a48..b0afac3332a6 100644 +--- a/caffe2/operators/softmax_ops.cu ++++ b/caffe2/operators/softmax_ops.cu +@@ -1,5 +1,5 @@ + #include <cfloat> +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/operators/spatial_batch_norm_op_impl.cuh b/caffe2/operators/spatial_batch_norm_op_impl.cuh +index f9b9fb58adc8..6fdb4c63f8ef 100644 +--- a/caffe2/operators/spatial_batch_norm_op_impl.cuh ++++ b/caffe2/operators/spatial_batch_norm_op_impl.cuh +@@ -5,7 +5,7 @@ + + #include <limits> + +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/sgd/adagrad_fused_op_gpu.cu b/caffe2/sgd/adagrad_fused_op_gpu.cu +index 396da5195125..63d0712e3970 100644 +--- a/caffe2/sgd/adagrad_fused_op_gpu.cu ++++ b/caffe2/sgd/adagrad_fused_op_gpu.cu +@@ -2,7 +2,7 @@ + #include <c10/core/GeneratorImpl.h> + #include <algorithm> + +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/device/device_radix_sort.cuh> + #include "caffe2/sgd/adagrad_fused_op_gpu.cuh" + #include "caffe2/utils/math.h" +diff --git a/caffe2/sgd/adagrad_op_gpu.cu b/caffe2/sgd/adagrad_op_gpu.cu +index a6fa842ddc80..0b7f499345be 100644 +--- a/caffe2/sgd/adagrad_op_gpu.cu ++++ b/caffe2/sgd/adagrad_op_gpu.cu +@@ -1,6 +1,6 @@ + #include <algorithm> + +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/common_gpu.h" + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/sgd/adam_op_gpu.cu b/caffe2/sgd/adam_op_gpu.cu +index 4b59836b6a68..a93812fabbe8 100644 +--- a/caffe2/sgd/adam_op_gpu.cu ++++ b/caffe2/sgd/adam_op_gpu.cu +@@ -1,4 +1,4 @@ +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/common_gpu.h" + #include "caffe2/core/context_gpu.h" +diff --git a/caffe2/utils/cub_namespace.cuh b/caffe2/utils/cub_namespace.cuh +new file mode 100644 +index 000000000000..c7a5db0dc013 +--- /dev/null ++++ b/caffe2/utils/cub_namespace.cuh +@@ -0,0 +1,7 @@ ++#include <ATen/cuda/cub_definitions.cuh> ++ ++#if USE_GLOBAL_CUB_WRAPPED_NAMESPACE() ++namespace caffe2 { ++namespace cub = ::CUB_WRAPPED_NAMESPACE::cub; ++} ++#endif +diff --git a/caffe2/utils/math/reduce.cu b/caffe2/utils/math/reduce.cu +index 20919334da50..69a6469d8ed1 100644 +--- a/caffe2/utils/math/reduce.cu ++++ b/caffe2/utils/math/reduce.cu +@@ -5,8 +5,7 @@ + #include <limits> + #include <numeric> + #include <vector> +- +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + + #include <thrust/execution_policy.h> +diff --git a/caffe2/utils/math/reduce.cuh b/caffe2/utils/math/reduce.cuh +index 39ad553eec76..18bdca11b9de 100644 +--- a/caffe2/utils/math/reduce.cuh ++++ b/caffe2/utils/math/reduce.cuh +@@ -1,7 +1,7 @@ + #ifndef CAFFE2_UTILS_MATH_REDUCE_CUH_ + #define CAFFE2_UTILS_MATH_REDUCE_CUH_ + +-#include <ATen/cuda/cub.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/common_gpu.h" +diff --git a/caffe2/utils/math_gpu.cu 
b/caffe2/utils/math_gpu.cu +index b0a44fed34fb..54b0a9391c26 100644 +--- a/caffe2/utils/math_gpu.cu ++++ b/caffe2/utils/math_gpu.cu +@@ -7,8 +7,9 @@ + #include <numeric> + #include <vector> + +-#include <ATen/cuda/cub.cuh> ++#include <cub/cub.cuh> + #include <cub/block/block_reduce.cuh> ++#include "caffe2/utils/cub_namespace.cuh" + + #include <thrust/host_vector.h> + #include <thrust/device_vector.h> +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index bfee8f6dc78a..a6a3946bac8e 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1618,7 +1618,7 @@ if(NOT INTERN_BUILD_MOBILE) + set(CMAKE_CXX_STANDARD 14) + endif() + +- if(NOT ${CUDA_VERSION} LESS 11.4) ++ if(NOT ${CUDA_VERSION} LESS 11.6) + string(APPEND CMAKE_CUDA_FLAGS " -DCUB_WRAPPED_NAMESPACE=at_cuda_detail") + endif() + + +From c22df90d62acfcb572b4b7bf55646fe338843a69 Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Tue, 12 Oct 2021 11:47:52 -0700 +Subject: [PATCH 21/30] save + +--- + aten/src/ATen/cuda/cub.cuh | 1 + + aten/src/ATen/cuda/cub_definitions.cuh | 2 +- + 2 files changed, 2 insertions(+), 1 deletion(-) + +diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh +index de14455adb98..fecf9e077a6f 100644 +--- a/aten/src/ATen/cuda/cub.cuh ++++ b/aten/src/ATen/cuda/cub.cuh +@@ -23,6 +23,7 @@ + #undef CUB_NS_POSTFIX + #undef CUB_NS_PREFIX + #undef CUB_NS_QUALIFIER ++ + #endif + + #include <ATen/cuda/Exceptions.h> +diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh +index 07cffe9d34e4..d639d28e66b6 100644 +--- a/aten/src/ATen/cuda/cub_definitions.cuh ++++ b/aten/src/ATen/cuda/cub_definitions.cuh +@@ -23,4 +23,4 @@ + #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() true + #else + #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false +-#endif +\ No newline at end of file ++#endif + +From 64adfaaaca8267cd6cf03ae7a524eee239530425 Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Tue, 12 Oct 2021 12:29:41 -0700 +Subject: [PATCH 22/30] save + +--- + caffe2/core/context_gpu.cu | 2 +- + caffe2/operators/cross_entropy_op.cu | 2 +- + caffe2/operators/find_op.cu | 2 +- + caffe2/operators/normalize_ops.cu | 2 +- + caffe2/operators/one_hot_ops.cu | 2 +- + caffe2/operators/reduce_front_back_max_ops.cu | 2 +- + caffe2/operators/reduce_front_back_sum_mean_ops.cu | 2 +- + caffe2/operators/softmax_ops.cu | 2 +- + caffe2/sgd/adagrad_op_gpu.cu | 2 +- + caffe2/sgd/adam_op_gpu.cu | 2 +- + cmake/Dependencies.cmake | 2 +- + 11 files changed, 11 insertions(+), 11 deletions(-) + +diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu +index 9ba9f74d5376..6d537400913e 100644 +--- a/caffe2/core/context_gpu.cu ++++ b/caffe2/core/context_gpu.cu +@@ -4,7 +4,6 @@ + #include <string> + #include <unordered_map> + +-#include "caffe2/utils/cub_namespace.cuh" + #include <ATen/Context.h> + #include <c10/cuda/CUDAFunctions.h> + #include <c10/cuda/CUDACachingAllocator.h> +@@ -22,6 +21,7 @@ + #include "caffe2/core/logging.h" + #include "caffe2/core/tensor.h" + #include "caffe2/utils/string_utils.h" ++#include "caffe2/utils/cub_namespace.cuh" + + C10_DEFINE_string( + caffe2_cuda_memory_pool, +diff --git a/caffe2/operators/cross_entropy_op.cu b/caffe2/operators/cross_entropy_op.cu +index 15cb8a4f574a..c23f05f8e5c2 100644 +--- a/caffe2/operators/cross_entropy_op.cu ++++ b/caffe2/operators/cross_entropy_op.cu +@@ -1,10 +1,10 @@ + #include <assert.h> +-#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + + 
#include "caffe2/core/context_gpu.h" + #include "caffe2/operators/cross_entropy_op.h" + #include "caffe2/operators/operator_fallback_gpu.h" ++#include "caffe2/utils/cub_namespace.cuh" + + namespace caffe2 { + +diff --git a/caffe2/operators/find_op.cu b/caffe2/operators/find_op.cu +index 20d42560b506..0418a71fbcda 100644 +--- a/caffe2/operators/find_op.cu ++++ b/caffe2/operators/find_op.cu +@@ -1,7 +1,7 @@ +-#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/find_op.h" ++#include "caffe2/utils/cub_namespace.cuh" + + namespace caffe2 { + +diff --git a/caffe2/operators/normalize_ops.cu b/caffe2/operators/normalize_ops.cu +index 952c4a772fa5..e4d1f34b754c 100644 +--- a/caffe2/operators/normalize_ops.cu ++++ b/caffe2/operators/normalize_ops.cu +@@ -1,11 +1,11 @@ + #include <algorithm> + +-#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/normalize_l1_op.h" + #include "caffe2/operators/normalize_op.h" ++#include "caffe2/utils/cub_namespace.cuh" + + namespace caffe2 { + +diff --git a/caffe2/operators/one_hot_ops.cu b/caffe2/operators/one_hot_ops.cu +index 4b1e054b0806..87e8196765ef 100644 +--- a/caffe2/operators/one_hot_ops.cu ++++ b/caffe2/operators/one_hot_ops.cu +@@ -1,8 +1,8 @@ +-#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/one_hot_ops.h" ++#include "caffe2/utils/cub_namespace.cuh" + + namespace caffe2 { + +diff --git a/caffe2/operators/reduce_front_back_max_ops.cu b/caffe2/operators/reduce_front_back_max_ops.cu +index d6bb862e4dbb..2ea25de46009 100644 +--- a/caffe2/operators/reduce_front_back_max_ops.cu ++++ b/caffe2/operators/reduce_front_back_max_ops.cu +@@ -1,7 +1,7 @@ +-#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/reduce_front_back_max_ops.h" ++#include "caffe2/utils/cub_namespace.cuh" + + #if defined(USE_ROCM) + #include <cfloat> +diff --git a/caffe2/operators/reduce_front_back_sum_mean_ops.cu b/caffe2/operators/reduce_front_back_sum_mean_ops.cu +index 2b5cb7110edf..a7ad6dd50084 100644 +--- a/caffe2/operators/reduce_front_back_sum_mean_ops.cu ++++ b/caffe2/operators/reduce_front_back_sum_mean_ops.cu +@@ -1,7 +1,7 @@ +-#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/reduce_front_back_sum_mean_ops.h" ++#include "caffe2/utils/cub_namespace.cuh" + + namespace caffe2 { + +diff --git a/caffe2/operators/softmax_ops.cu b/caffe2/operators/softmax_ops.cu +index b0afac3332a6..ebf0700c9ef0 100644 +--- a/caffe2/operators/softmax_ops.cu ++++ b/caffe2/operators/softmax_ops.cu +@@ -1,11 +1,11 @@ + #include <cfloat> +-#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + + #include "caffe2/core/context_gpu.h" + #include "caffe2/operators/softmax_op.h" + #include "caffe2/operators/softmax_with_loss_op.h" + #include "caffe2/operators/spatial_softmax_with_loss_op.h" ++#include "caffe2/utils/cub_namespace.cuh" + + namespace caffe2 { + +diff --git a/caffe2/sgd/adagrad_op_gpu.cu b/caffe2/sgd/adagrad_op_gpu.cu +index 0b7f499345be..b80d29700c3f 100644 +--- a/caffe2/sgd/adagrad_op_gpu.cu ++++ b/caffe2/sgd/adagrad_op_gpu.cu +@@ -1,10 +1,10 @@ + #include <algorithm> + +-#include 
"caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/common_gpu.h" + #include "caffe2/core/context_gpu.h" + #include "caffe2/sgd/adagrad_op.h" ++#include "caffe2/utils/cub_namespace.cuh" + + namespace caffe2 { + +diff --git a/caffe2/sgd/adam_op_gpu.cu b/caffe2/sgd/adam_op_gpu.cu +index a93812fabbe8..6f9c3234204d 100644 +--- a/caffe2/sgd/adam_op_gpu.cu ++++ b/caffe2/sgd/adam_op_gpu.cu +@@ -1,8 +1,8 @@ +-#include "caffe2/utils/cub_namespace.cuh" + #include <cub/block/block_reduce.cuh> + #include "caffe2/core/common_gpu.h" + #include "caffe2/core/context_gpu.h" + #include "caffe2/sgd/adam_op.h" ++#include "caffe2/utils/cub_namespace.cuh" + + namespace caffe2 { + +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index a6a3946bac8e..5074bab1e999 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1618,7 +1618,7 @@ if(NOT INTERN_BUILD_MOBILE) + set(CMAKE_CXX_STANDARD 14) + endif() + +- if(NOT ${CUDA_VERSION} LESS 11.6) ++ if(NOT ${CUDA_VERSION} LESS 11.5) + string(APPEND CMAKE_CUDA_FLAGS " -DCUB_WRAPPED_NAMESPACE=at_cuda_detail") + endif() + + +From 21abede594471e4f225def12796c1d7c73a96b20 Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Tue, 12 Oct 2021 12:46:32 -0700 +Subject: [PATCH 23/30] save + +--- + aten/src/ATen/cuda/cub_definitions.cuh | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh +index d639d28e66b6..9828abdfc99a 100644 +--- a/aten/src/ATen/cuda/cub_definitions.cuh ++++ b/aten/src/ATen/cuda/cub_definitions.cuh +@@ -15,10 +15,10 @@ + #define CUB_SUPPORTS_NV_BFLOAT16() false + #endif + +-// cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.14 in: ++// cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in: + // https://github.com/NVIDIA/cub/pull/326 + // CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake +-// starting from CUDA 11.6 ++// starting from CUDA 11.5 + #if defined(CUB_WRAPPED_NAMESPACE) || defined(THRUST_CUB_WRAPPED_NAMESPACE) + #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() true + #else + +From c73b101377d5adb8f96eb31988ceef7c2e760839 Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Tue, 12 Oct 2021 12:59:12 -0700 +Subject: [PATCH 24/30] comment + +--- + aten/src/ATen/cuda/cub.cuh | 2 ++ + cmake/Dependencies.cmake | 2 ++ + 2 files changed, 4 insertions(+) + +diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh +index fecf9e077a6f..0532470d74ec 100644 +--- a/aten/src/ATen/cuda/cub.cuh ++++ b/aten/src/ATen/cuda/cub.cuh +@@ -13,6 +13,8 @@ + + #else + ++// include cub in a safe manner, see: ++// https://github.com/pytorch/pytorch/pull/55292 + #undef CUB_NS_POSTFIX //undef to avoid redefinition warnings + #undef CUB_NS_PREFIX + #undef CUB_NS_QUALIFIER +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index 5074bab1e999..bde42bf1719f 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1618,6 +1618,8 @@ if(NOT INTERN_BUILD_MOBILE) + set(CMAKE_CXX_STANDARD 14) + endif() + ++ # use cub in a safe manner, see: ++ # https://github.com/pytorch/pytorch/pull/55292 + if(NOT ${CUDA_VERSION} LESS 11.5) + string(APPEND CMAKE_CUDA_FLAGS " -DCUB_WRAPPED_NAMESPACE=at_cuda_detail") + endif() + +From 1cb29c9816e795767236854644a1ba2c28da6e0d Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Fri, 22 Oct 2021 10:42:32 -0700 +Subject: [PATCH 25/30] fix + +--- 
+ caffe2/utils/cub_namespace.cuh | 19 ++++++++++++++++++- + 1 file changed, 18 insertions(+), 1 deletion(-) + +diff --git a/caffe2/utils/cub_namespace.cuh b/caffe2/utils/cub_namespace.cuh +index c7a5db0dc013..752f273128ee 100644 +--- a/caffe2/utils/cub_namespace.cuh ++++ b/caffe2/utils/cub_namespace.cuh +@@ -1,4 +1,21 @@ +-#include <ATen/cuda/cub_definitions.cuh> ++#pragma once ++ ++#if !defined(USE_ROCM) ++#include <cuda.h> ++#include <cub/version.cuh> ++#else ++#define CUB_VERSION 0 ++#endif ++ ++// cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in: ++// https://github.com/NVIDIA/cub/pull/326 ++// CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake ++// starting from CUDA 11.5 ++#if defined(CUB_WRAPPED_NAMESPACE) || defined(THRUST_CUB_WRAPPED_NAMESPACE) ++#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() true ++#else ++#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false ++#endif + + #if USE_GLOBAL_CUB_WRAPPED_NAMESPACE() + namespace caffe2 { + +From 3ca182d3c6d1dcc45396c3f2f3169cc84496ca1f Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Sun, 24 Oct 2021 14:27:09 -0700 +Subject: [PATCH 26/30] fix some cuda 10 build + +--- + aten/src/ATen/cuda/cub_definitions.cuh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh +index 9828abdfc99a..5243e551f4a7 100644 +--- a/aten/src/ATen/cuda/cub_definitions.cuh ++++ b/aten/src/ATen/cuda/cub_definitions.cuh +@@ -1,6 +1,6 @@ + #pragma once + +-#if !defined(USE_ROCM) ++#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + #include <cuda.h> + #include <cub/version.cuh> + #else + +From 3a7c2bc6da6f2be060be879d0b2277b833ab7cc9 Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Sun, 24 Oct 2021 14:28:53 -0700 +Subject: [PATCH 27/30] fix caffe2 + +--- + caffe2/utils/cub_namespace.cuh | 7 ------- + 1 file changed, 7 deletions(-) + +diff --git a/caffe2/utils/cub_namespace.cuh b/caffe2/utils/cub_namespace.cuh +index 752f273128ee..188a9936f9c6 100644 +--- a/caffe2/utils/cub_namespace.cuh ++++ b/caffe2/utils/cub_namespace.cuh +@@ -1,12 +1,5 @@ + #pragma once + +-#if !defined(USE_ROCM) +-#include <cuda.h> +-#include <cub/version.cuh> +-#else +-#define CUB_VERSION 0 +-#endif +- + // cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in: + // https://github.com/NVIDIA/cub/pull/326 + // CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake + +From 960606e0dc1da7c16b6cb0ae11b04bace95816ac Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Sun, 24 Oct 2021 14:50:13 -0700 +Subject: [PATCH 28/30] fix + +--- + aten/src/ATen/cuda/cub_definitions.cuh | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh +index 5243e551f4a7..b921af480655 100644 +--- a/aten/src/ATen/cuda/cub_definitions.cuh ++++ b/aten/src/ATen/cuda/cub_definitions.cuh +@@ -1,7 +1,10 @@ + #pragma once + +-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 ++#if !define(USE_ROCM) + #include <cuda.h> ++#endif ++ ++#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + #include <cub/version.cuh> + #else + #define CUB_VERSION 0 + +From 48f128f55396622bfb952b474888813334fae303 Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Sun, 24 Oct 2021 14:51:12 -0700 +Subject: [PATCH 29/30] save + +--- + aten/src/ATen/cuda/cub_definitions.cuh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + 
+diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh +index b921af480655..8c99f4951c6c 100644 +--- a/aten/src/ATen/cuda/cub_definitions.cuh ++++ b/aten/src/ATen/cuda/cub_definitions.cuh +@@ -1,7 +1,7 @@ + #pragma once + + #if !define(USE_ROCM) +-#include <cuda.h> ++#include <cuda.h> // for CUDA_VERSION + #endif + + #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + +From b2023a036afe9ddff4f710953ad6ba55f8396ecf Mon Sep 17 00:00:00 2001 +From: Xiang Gao <[email protected]> +Date: Sun, 24 Oct 2021 14:51:59 -0700 +Subject: [PATCH 30/30] fixed + +--- + aten/src/ATen/cuda/cub_definitions.cuh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh +index 8c99f4951c6c..61119fc17458 100644 +--- a/aten/src/ATen/cuda/cub_definitions.cuh ++++ b/aten/src/ATen/cuda/cub_definitions.cuh +@@ -1,6 +1,6 @@ + #pragma once + +-#if !define(USE_ROCM) ++#if !defined(USE_ROCM) + #include <cuda.h> // for CUDA_VERSION + #endif + Modified: PKGBUILD =================================================================== --- PKGBUILD 2021-10-31 22:13:18 UTC (rev 1035134) +++ PKGBUILD 2021-10-31 22:49:00 UTC (rev 1035135) @@ -2,12 +2,13 @@ # Contributor: Stephen Zhang <zsrkmyn at gmail dot com> pkgbase=python-pytorch -pkgname=("python-pytorch" "python-pytorch-opt" "python-pytorch-cuda" "python-pytorch-opt-cuda") +pkgname=("python-pytorch" "python-pytorch-cuda") _pkgname="pytorch" -pkgver=1.9.1 -_pkgver=1.9.1 -pkgrel=2 -pkgdesc="Tensors and Dynamic neural networks in Python with strong GPU acceleration" +pkgver=1.10.0 +_pkgver=1.10.0 +pkgrel=1 +_pkgdesc='Tensors and Dynamic neural networks in Python with strong GPU acceleration' +pkgdesc="${_pkgdesc}" arch=('x86_64') url="https://pytorch.org" license=('BSD') @@ -40,12 +41,15 @@ "${pkgname}-kineto::git+https://github.com/pytorch/kineto" "${pkgname}-sleef::git+https://github.com/shibatch/sleef" "${pkgname}-onnx-tensorrt::git+https://github.com/onnx/onnx-tensorrt" + "${pkgname}-pocketfft::git+https://github.com/mreineck/pocketfft" + "${pkgname}-cudnn-frontend::git+https://github.com/NVIDIA/cudnn-frontend.git" "${pkgname}-benchmark::git+https://github.com/google/benchmark.git" "${pkgname}-tbb::git+https://github.com/01org/tbb" - "${pkgname}-XNNPACK::git+https://github.com/malfet/XNNPACK.git" + "${pkgname}-XNNPACK::git+https://github.com/google/XNNPACK.git" "${pkgname}-fbjni::git+https://github.com/facebookincubator/fbjni.git" "${pkgname}-tensorpipe::git+https://github.com/pytorch/tensorpipe.git" "${pkgname}-pybind11::git+https://github.com/pybind/pybind11.git" + "${pkgname}-breakpad::git+https://github.com/driazati/breakpad.git" "${pkgname}-fbgemm::git+https://github.com/pytorch/fbgemm" "${pkgname}-googletest::git+https://github.com/google/googletest.git" "${pkgname}-zstd::git+https://github.com/facebook/zstd.git" @@ -52,14 +56,14 @@ "${pkgname}-onnx::git+https://github.com/onnx/onnx.git" "${pkgname}-protobuf::git+https://github.com/protocolbuffers/protobuf.git" "${pkgname}-fmt::git+https://github.com/fmtlib/fmt.git" + https://github.com/oneapi-src/oneDNN/commit/1fe0f2594a1bfc6386fd8f6537f971d5ae9c1214.patch + fix_old_nnapi_lite_interpreter_config.patch + fix-jit-frontend-nullptr-deref.patch fix_include_system.patch use-system-libuv.patch fix-building-for-torchvision.patch - benchmark-gcc11.patch - xnnpack-gcc11.patch - https://github.com/pytorch/pytorch/commit/c74c0c571880df886474be297c556562e95c00e0.patch fix_c10.patch - 
disable_non_x86_64.patch) + 66219.patch) sha256sums=('SKIP' 'SKIP' 'SKIP' @@ -95,17 +99,24 @@ 'SKIP' 'SKIP' 'SKIP' + 'SKIP' + 'SKIP' + 'SKIP' + '7728e99500d8034c837bbbe2b48b780d8563de4e56fff38a96766caad08cce05' + '21476edfa61573892a325cb8a91e13f601142e39b34e24e4575d2cdebb063b3f' + 'c272684a4c747f034163fcfd9dbb7264d5fe821dd25a060f0b791760ad0083ae' '557761502bbd994d9795bef46779e4b8c60ba0b45e7d60841f477d3b7f28a00a' 'cd9ac4aaa9f946ac5eafc57cf66c5c16b3ea7ac8af32c2558fad0705411bb669' - '689c76e89bcf403df1b4cf7ca784381967b6a6527ed6eb6d0ad6681cf789b738' - '278fecdb45df065343f51688cc7a1665153b5189f3341a741d546b0b518eac40' - '64833e96e47a22f88336381f25fcd73127208dc79e2074398295d88c4596c06a' - '3d5b9d3bbba3238d8f165e582039ec07798bccc1d1f44bd91e8b1892236cb70f' - 'ba801238afcfc58a35410e54d4ca6a638c447865c0c6b38ed16917fd6d507954' - 'd3ef8491718ed7e814fe63e81df2f49862fffbea891d2babbcb464796a1bd680') + '600bd6a4bbcec9f99ab815d82cee1c2875530b2b75f4010da5ba72ce9bf31aff' + '4d0d7da4a3fb099ed75f3007559fad04ac96eed87c523b274fb3bb6020e6b9b8' + 'd86efbe915386989d75d313fc76785e6d9c5638b983f17e98cca32174ac1fcee') +get_pyver () { + python -c 'import sys; print(str(sys.version_info[0]) + "." + str(sys.version_info[1]))' +} + prepare() { - cd "${_pkgname}-${pkgver}" + cd "${srcdir}/${_pkgname}-${pkgver}" # generated using parse-submodules git submodule init @@ -124,7 +135,7 @@ git config submodule."third_party/NNPACK_deps/FP16".url "${srcdir}/${pkgname}"-FP16 git config submodule."third_party/NNPACK_deps/psimd".url "${srcdir}/${pkgname}"-psimd git config submodule."third_party/zstd".url "${srcdir}/${pkgname}"-zstd - git config submodule."third-party/cpuinfo".url "${srcdir}/${pkgname}"-cpuinfo + git config submodule."third_party/cpuinfo".url "${srcdir}/${pkgname}"-cpuinfo git config submodule."third_party/python-enum".url "${srcdir}/${pkgname}"-enum34 git config submodule."third_party/python-peachpy".url "${srcdir}/${pkgname}"-PeachPy git config submodule."third_party/python-six".url "${srcdir}/${pkgname}"-six @@ -143,7 +154,10 @@ git config submodule."third_party/XNNPACK".url "${srcdir}/${pkgname}"-XNNPACK git config submodule."third_party/fmt".url "${srcdir}/${pkgname}"-fmt git config submodule."third_party/tensorpipe".url "${srcdir}/${pkgname}"-tensorpipe + git config submodule."third_party/cudnn_frontend".url "${srcdir}/${pkgname}"-cudnn-frontend git config submodule."third_party/kineto".url "${srcdir}/${pkgname}"-kineto + git config submodule."third_party/pocketfft".url "${srcdir}/${pkgname}"-pocketfft + git config submodule."third_party/breakpad".url "${srcdir}/${pkgname}"-breakpad git submodule update --init --recursive @@ -156,15 +170,24 @@ # fix https://github.com/pytorch/vision/issues/3695 patch -Np1 -i "${srcdir}/fix-building-for-torchvision.patch" - # GCC 11 fixes - patch -Np1 -d third_party/benchmark <../benchmark-gcc11.patch - patch -Np1 -d third_party/XNNPACK <../xnnpack-gcc11.patch - - # cuda 11.4 fix - patch -Np1 <../c74c0c571880df886474be297c556562e95c00e0.patch # cuda 11.4.1 fix patch -Np1 -i "${srcdir}/fix_c10.patch" + # https://discuss.pytorch.org/t/about-build-android-sh-lite-and-nnapi/133581 + patch -Np1 -i "${srcdir}/fix_old_nnapi_lite_interpreter_config.patch" + + # fix nullptr dereference + patch -Np1 -i "${srcdir}/fix-jit-frontend-nullptr-deref.patch" + + # disable vec tests + sed -e '/set(ATen_VEC_TEST_SRCS ${ATen_VEC_TEST_SRCS} PARENT_SCOPE)/d' -i aten/CMakeLists.txt + + # https://github.com/pytorch/pytorch/issues/67153, https://github.com/pytorch/pytorch/pull/66219 + patch -Np1 -i 
"${srcdir}/66219.patch" + + # fix ideep/mkl-dnn + patch -Np1 -d third_party/ideep/mkl-dnn -i "${srcdir}/1fe0f2594a1bfc6386fd8f6537f971d5ae9c1214.patch" + # remove local nccl rm -rf third_party/nccl/nccl # also remove path from nccl module, so it's not checked @@ -173,11 +196,9 @@ # fix build with google-glog 0.5 https://github.com/pytorch/pytorch/issues/58054 sed -e '/namespace glog_internal_namespace_/d' -e 's|::glog_internal_namespace_||' -i c10/util/Logging.cpp - cd .. + cd "${srcdir}" - cp -a "${_pkgname}-${pkgver}" "${_pkgname}-${pkgver}-opt" - cp -a "${_pkgname}-${pkgver}" "${_pkgname}-${pkgver}-cuda" - cp -a "${_pkgname}-${pkgver}" "${_pkgname}-${pkgver}-opt-cuda" + cp -r "${_pkgname}-${pkgver}" "${_pkgname}-${pkgver}-cuda" export VERBOSE=1 export PYTORCH_BUILD_VERSION="${pkgver}" @@ -203,6 +224,8 @@ export CUDAHOSTCXX=/usr/bin/g++ export CUDA_HOST_COMPILER="${CUDAHOSTCXX}" export CUDA_HOME=/opt/cuda + # hide buildt-time CUDA devices + export CUDA_VISIBLE_DEVICES="" export CUDNN_LIB_DIR=/usr/lib export CUDNN_INCLUDE_DIR=/usr/include export TORCH_NVCC_FLAGS="-Xfatbin -compress-all" @@ -211,38 +234,24 @@ } build() { - echo "Building without cuda and without non-x86-64 optimizations" + echo "Building without cuda and with non-x86-64 optimizations" export USE_CUDA=0 export USE_CUDNN=0 cd "${srcdir}/${_pkgname}-${pkgver}" - echo "add_definitions(-march=x86-64)" >> cmake/MiscCheck.cmake - patch -Np1 -i "${srcdir}/disable_non_x86_64.patch" - python setup.py build - - - echo "Building without cuda and with non-x86-64 optimizations" - export USE_CUDA=0 - export USE_CUDNN=0 - cd "${srcdir}/${_pkgname}-${pkgver}-opt" echo "add_definitions(-march=haswell)" >> cmake/MiscCheck.cmake - python setup.py build + # this horrible hack is necessary because the current release + # ships inconsistent CMake which tries to build objects before + # thier dependencies, build twice when dependencies are available + python setup.py build || python setup.py build - echo "Building with cuda and without non-x86-64 optimizations" + echo "Building with cuda and with non-x86-64 optimizations" export USE_CUDA=1 export USE_CUDNN=1 cd "${srcdir}/${_pkgname}-${pkgver}-cuda" - patch -Np1 -i "${srcdir}/disable_non_x86_64.patch" - echo "add_definitions(-march=x86-64)" >> cmake/MiscCheck.cmake - python setup.py build - - - echo "Building with cuda and with non-x86-64 optimizations" - export USE_CUDA=1 - export USE_CUDNN=1 - cd "${srcdir}/${_pkgname}-${pkgver}-opt-cuda" echo "add_definitions(-march=haswell)" >> cmake/MiscCheck.cmake - python setup.py build + # same horrible hack as above + python setup.py build || python setup.py build } _package() { @@ -253,7 +262,7 @@ install -Dm644 LICENSE "${pkgdir}/usr/share/licenses/${pkgname}/LICENSE" - pytorchpath="usr/lib/python3.9/site-packages/torch" + pytorchpath="usr/lib/python$(get_pyver)/site-packages/torch" install -d "${pkgdir}/usr/lib" # put CMake files in correct place @@ -277,22 +286,16 @@ } package_python-pytorch() { + pkgdesc+="${_pkgdesc} (with AVX2 CPU optimizations)" + replaces=(python-pytorch-opt) cd "${srcdir}/${_pkgname}-${pkgver}" _package } -package_python-pytorch-opt() { - pkgdesc="Tensors and Dynamic neural networks in Python with strong GPU acceleration (with AVX2 CPU optimizations)" - conflicts=(python-pytorch) - provides=(python-pytorch) - - cd "${srcdir}/${_pkgname}-${pkgver}-opt" - _package -} - package_python-pytorch-cuda() { - pkgdesc="Tensors and Dynamic neural networks in Python with strong GPU acceleration (with CUDA)" + pkgdesc="${_pkgdesc} (with 
CUDA and AVX2 CPU optimizations)" depends+=(cuda cudnn magma) + replaces=(python-pytorch-opt-cuda) conflicts=(python-pytorch) provides=(python-pytorch) @@ -300,14 +303,4 @@ _package } -package_python-pytorch-opt-cuda() { - pkgdesc="Tensors and Dynamic neural networks in Python with strong GPU acceleration (with CUDA and AVX2 CPU optimizations)" - depends+=(cuda cudnn magma) - conflicts=(python-pytorch) - provides=(python-pytorch python-pytorch-cuda) - - cd "${srcdir}/${_pkgname}-${pkgver}-opt-cuda" - _package -} - # vim:set ts=2 sw=2 et: Deleted: benchmark-gcc11.patch =================================================================== --- benchmark-gcc11.patch 2021-10-31 22:13:18 UTC (rev 1035134) +++ benchmark-gcc11.patch 2021-10-31 22:49:00 UTC (rev 1035135) @@ -1,30 +0,0 @@ -From 3d1c2677686718d906f28c1d4da001c42666e6d2 Mon Sep 17 00:00:00 2001 -From: Sergei Trofimovich <[email protected]> -Date: Thu, 15 Oct 2020 09:12:40 +0100 -Subject: [PATCH] src/benchmark_register.h: add missing <limits> inclusion - (#1060) - -Noticed missing header when was building llvm with gcc-11: - -``` -llvm-project/llvm/utils/benchmark/src/benchmark_register.h:17:30: - error: 'numeric_limits' is not a member of 'std' - 17 | static const T kmax = std::numeric_limits<T>::max(); - | ^~~~~~~~~~~~~~ -``` ---- - src/benchmark_register.h | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/src/benchmark_register.h b/src/benchmark_register.h -index 61377d742..204bf1d9f 100644 ---- a/src/benchmark_register.h -+++ b/src/benchmark_register.h -@@ -1,6 +1,7 @@ - #ifndef BENCHMARK_REGISTER_H - #define BENCHMARK_REGISTER_H - -+#include <limits> - #include <vector> - - #include "check.h" Deleted: disable_non_x86_64.patch =================================================================== --- disable_non_x86_64.patch 2021-10-31 22:13:18 UTC (rev 1035134) +++ disable_non_x86_64.patch 2021-10-31 22:49:00 UTC (rev 1035135) @@ -1,15 +0,0 @@ -diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py -index d5db749d15..fd54cca6b8 100644 ---- a/tools/setup_helpers/cmake.py -+++ b/tools/setup_helpers/cmake.py -@@ -295,6 +295,10 @@ class CMake: - build_options.update(cmake__options) - - CMake.defines(args, -+ DISABLE_AVX2=1, -+ DISABLE_AVX512F=1, -+ DISABLE_FMA4=1, -+ DISABLE_SSE4=1, - PYTHON_EXECUTABLE=sys.executable, - PYTHON_LIBRARY=cmake_python_library, - PYTHON_INCLUDE_DIR=distutils.sysconfig.get_python_inc(), Modified: fix-building-for-torchvision.patch =================================================================== --- fix-building-for-torchvision.patch 2021-10-31 22:13:18 UTC (rev 1035134) +++ fix-building-for-torchvision.patch 2021-10-31 22:49:00 UTC (rev 1035135) @@ -17,9 +17,9 @@ constexpr bool op_allowlist_check(string_view op_name) { - assert(op_name.find("::") != string_view::npos); +// assert(op_name.find("::") != string_view::npos); - #if !defined(TORCH_OPERATOR_WHITELIST) - // If the TORCH_OPERATOR_WHITELIST parameter is not defined, - // all ops are to be registered + // Use assert() instead of throw() due to a gcc bug. 
See: + // https://stackoverflow.com/questions/34280729/throw-in-constexpr-function + // https://github.com/fmtlib/fmt/issues/682 -- 2.31.1 Added: fix-jit-frontend-nullptr-deref.patch =================================================================== --- fix-jit-frontend-nullptr-deref.patch (rev 0) +++ fix-jit-frontend-nullptr-deref.patch 2021-10-31 22:49:00 UTC (rev 1035135) @@ -0,0 +1,12 @@ +diff --color -aur pytorch-1.10.0-old/torch/csrc/jit/frontend/ir_emitter.cpp pytorch-1.10.0-new/torch/csrc/jit/frontend/ir_emitter.cpp +--- pytorch-1.10.0-old/torch/csrc/jit/frontend/ir_emitter.cpp 2021-10-26 01:41:27.453059792 +0300 ++++ pytorch-1.10.0-new/torch/csrc/jit/frontend/ir_emitter.cpp 2021-10-26 02:00:09.783068924 +0300 +@@ -1678,7 +1678,7 @@ + << "Union type annotation `" << type_hint->repr_str() + << "` can hold " << vector_repr.str() << ", but none of " + << "those list types can hold the types of the given dict" +- << " elements, which were unified to " << candidate->repr_str(); ++ << " elements, which were unified to " << (*unified_value_type)->repr_str(); + } else { + refined_type_hint = candidate; + } Modified: fix_c10.patch =================================================================== --- fix_c10.patch 2021-10-31 22:13:18 UTC (rev 1035134) +++ fix_c10.patch 2021-10-31 22:49:00 UTC (rev 1035135) @@ -7,6 +7,6 @@ static_assert( - sizeof(void*) != sizeof(int64_t) || // if 64-bit... + sizeof(void*) <= sizeof(int64_t) || // if 64-bit... - sizeof(TensorImpl) == sizeof(int64_t) * 23, + sizeof(TensorImpl) == sizeof(int64_t) * 24, "You changed the size of TensorImpl on 64-bit arch." "See Note [TensorImpl size constraints] on how to proceed."); Added: fix_old_nnapi_lite_interpreter_config.patch =================================================================== --- fix_old_nnapi_lite_interpreter_config.patch (rev 0) +++ fix_old_nnapi_lite_interpreter_config.patch 2021-10-31 22:49:00 UTC (rev 1035135) @@ -0,0 +1,33 @@ +# Relevant discussion: https://discuss.pytorch.org/t/about-build-android-sh-lite-and-nnapi/133581 +diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt +index baf9666f11..19f9a78443 100644 +--- a/aten/src/ATen/CMakeLists.txt ++++ b/aten/src/ATen/CMakeLists.txt +@@ -130,7 +130,7 @@ add_subdirectory(quantized) + add_subdirectory(nnapi) + + if(BUILD_LITE_INTERPRETER) +- set(all_cpu_cpp ${generated_cpp} ${core_generated_cpp} ${cpu_kernel_cpp}) ++ set(all_cpu_cpp ${generated_cpp} ${core_generated_cpp} ${ATen_NNAPI_SRCS} ${cpu_kernel_cpp}) + append_filelist("jit_core_sources" all_cpu_cpp) + append_filelist("aten_cpu_source_non_codegen_list" all_cpu_cpp) + append_filelist("aten_native_source_non_codegen_list" all_cpu_cpp) +diff --git a/scripts/build_android.sh b/scripts/build_android.sh +index daad46e8fb..211f5bb429 100755 +--- a/scripts/build_android.sh ++++ b/scripts/build_android.sh +@@ -147,7 +147,11 @@ if [ "${ANDROID_DEBUG_SYMBOLS:-}" == '1' ]; then + fi + + if [ -n "${USE_VULKAN}" ]; then +- CMAKE_ARGS+=("-DUSE_VULKAN=ON") ++ CMAKE_ARGS+=("-DUSE_VULKAN=${USE_VULKAN}") ++fi ++ ++if [ -n "${USE_NNAPI}" ]; then ++ CMAKE_ARGS+=("-DUSE_NNAPI=${USE_NNAPI}") + fi + + # Use-specified CMake arguments go last to allow overridding defaults + + Deleted: xnnpack-gcc11.patch =================================================================== --- xnnpack-gcc11.patch 2021-10-31 22:13:18 UTC (rev 1035134) +++ xnnpack-gcc11.patch 2021-10-31 22:49:00 UTC (rev 1035135) @@ -1,33 +0,0 @@ -From 042cdaf1c24c675fca5e79eb4d2665839d7df2c2 Mon Sep 17 00:00:00 2001 -From: 
=?UTF-8?q?Nenad=20Miks=CC=8Ca?= <[email protected]> -Date: Mon, 3 May 2021 13:28:59 +0200 -Subject: [PATCH] GCC 11 no longer needs this polyfill - ---- - src/xnnpack/intrinsics-polyfill.h | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/src/xnnpack/intrinsics-polyfill.h b/src/xnnpack/intrinsics-polyfill.h -index 3f198d88..32d7d4f3 100644 ---- a/src/xnnpack/intrinsics-polyfill.h -+++ b/src/xnnpack/intrinsics-polyfill.h -@@ -11,8 +11,8 @@ - #if defined(__SSE2__) - #include <emmintrin.h> - --// GCC any, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, and ICC pre-16 --#if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)) || \ -+// GCC pre-11, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, and ICC pre-16 -+#if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && __GNUC__ < 11) || \ - (defined(__clang__) && !defined(__apple_build_version__) && (__clang_major__ < 8)) || \ - (defined(__clang__) && defined(__ANDROID__) && (__clang_major__ == 8) && (__clang_minor__ == 0) && (__clang_patchlevel__ < 7)) || \ - (defined(__clang__) && defined(__apple_build_version__) && (__apple_build_version__ < 11000000)) || \ -@@ -27,7 +27,7 @@ static XNN_INTRINSIC - void _mm_storeu_si32(const void* address, __m128i v) { - *((int*) address) = _mm_cvtsi128_si32(v); - } --#endif // GCC any, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, and ICC pre-16 -+#endif // GCC pre-11, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, and ICC pre-16 - #endif // SSE2 - - #ifdef __AVX512F__
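
Note on the new build() logic above: `python setup.py build || python setup.py build` simply reruns the build once when the first pass fails, relying on the objects and generated files produced by that first pass to satisfy the previously missing dependencies. A minimal standalone sketch of the same retry pattern follows; the helper name and the attempt count are illustrative and not part of the PKGBUILD.

# Hypothetical retry helper (not in the PKGBUILD); it generalizes the
# "build || build" fallback used in build() above.
retry_build() {
  local attempts=$1; shift
  local i
  for ((i = 1; i <= attempts; i++)); do
    echo "Attempt ${i}/${attempts}: $*"
    "$@" && return 0   # stop as soon as one run succeeds
  done
  return 1             # every attempt failed
}

# Equivalent in effect to: python setup.py build || python setup.py build
# retry_build 2 python setup.py build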
