This is an automated email from the ASF dual-hosted git repository.
zhreshold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new 838e256 Optimize NMS part 2 (#14352)
838e256 is described below
commit 838e256cbfbf0856c42b268b801f7f55271bcff2
Author: Przemyslaw Tredak <[email protected]>
AuthorDate: Thu Mar 7 18:57:08 2019 -0800
Optimize NMS part 2 (#14352)
* Optimize NMS part 2
* Guarding ldg intrinsics
---
src/operator/contrib/bounding_box-common.h | 10 +++++++
src/operator/contrib/bounding_box-inl.cuh | 44 ++++++++++++++++++++++++++++++
src/operator/contrib/bounding_box-inl.h | 27 +++++++++---------
3 files changed, 68 insertions(+), 13 deletions(-)
diff --git a/src/operator/contrib/bounding_box-common.h
b/src/operator/contrib/bounding_box-common.h
index 70215ab..4c9b1b8 100644
--- a/src/operator/contrib/bounding_box-common.h
+++ b/src/operator/contrib/bounding_box-common.h
@@ -112,6 +112,16 @@ struct nms_impl {
}
};
+namespace mshadow_op {
+struct less_than : public mxnet_op::tunable {
+ // a is x, b is sigma
+ template<typename DType>
+ MSHADOW_XINLINE static DType Map(DType a, DType b) {
+ return static_cast<DType>(a < b);
+ }
+}; // struct equal_to
+} // namespace mshadow_op
+
} // namespace op
} // namespace mxnet
diff --git a/src/operator/contrib/bounding_box-inl.cuh
b/src/operator/contrib/bounding_box-inl.cuh
index 4b7cf34..e7f5567 100644
--- a/src/operator/contrib/bounding_box-inl.cuh
+++ b/src/operator/contrib/bounding_box-inl.cuh
@@ -280,6 +280,50 @@ void NMSApply(mshadow::Stream<gpu> *s,
}
}
+__launch_bounds__(512)
+__global__ void nms_calculate_batch_start_kernel(int32_t * batch_start,
+ int32_t * valid_batch_id,
+ size_t N,
+ int num_batch) {
+ size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+ if (tid < N) {
+#if __CUDA_ARCH__ >= 350
+ const int32_t previous = tid > 0 ? __ldg(valid_batch_id + tid - 1) : -1;
+ const int32_t my = __ldg(valid_batch_id + tid);
+#else
+ const int32_t previous = tid > 0 ? valid_batch_id[tid - 1] : -1;
+ const int32_t my = valid_batch_id[tid];
+#endif
+ if (my > previous) {
+ for (int32_t current = previous + 1; current <= my; ++current) {
+ batch_start[current] = tid;
+ }
+ }
+ if (tid == N - 1) {
+ for (int32_t current = my + 1; current <= num_batch; ++current) {
+ batch_start[current] = tid + 1;
+ }
+ }
+ }
+}
+
+inline void NMSCalculateBatchStart(mshadow::Stream<gpu> *s,
+ mshadow::Tensor<gpu, 1, int32_t>*
batch_start,
+ mshadow::Tensor<gpu, 1, int32_t>*
valid_batch_id,
+ int num_batch) {
+ using namespace mshadow;
+ using namespace mshadow::expr;
+ using namespace mxnet_op;
+ auto stream = mshadow::Stream<gpu>::GetStream(s);
+ constexpr int block_size = 512;
+ const int num_elements = valid_batch_id->size(0);
+ const int blocks = (num_elements + block_size - 1) / block_size;
+ nms_calculate_batch_start_kernel<<<blocks, block_size, 0,
stream>>>(batch_start->dptr_,
+
valid_batch_id->dptr_,
+
num_elements,
+
num_batch);
+}
+
} // namespace op
} // namespace mxnet
diff --git a/src/operator/contrib/bounding_box-inl.h
b/src/operator/contrib/bounding_box-inl.h
index 35ab19d..8610dcc 100644
--- a/src/operator/contrib/bounding_box-inl.h
+++ b/src/operator/contrib/bounding_box-inl.h
@@ -162,15 +162,6 @@ int FilterScores(mshadow::Tensor<cpu, 1, DType> out_scores,
return j;
}
-namespace mshadow_op {
-struct less_than : public mxnet_op::tunable {
- // a is x, b is sigma
- template<typename DType>
- MSHADOW_XINLINE static DType Map(DType a, DType b) {
- return static_cast<DType>(a < b);
- }
-}; // struct equal_to
-} // namespace mshadow_op
struct corner_to_center {
template<typename DType>
@@ -277,6 +268,19 @@ void NMSApply(mshadow::Stream<cpu> *s,
}
}
+inline void NMSCalculateBatchStart(mshadow::Stream<cpu> *s,
+ mshadow::Tensor<cpu, 1, int32_t>*
batch_start,
+ mshadow::Tensor<cpu, 1, int32_t>*
valid_batch_id,
+ int num_batch) {
+ using namespace mshadow;
+ using namespace mshadow::expr;
+ using namespace mxnet_op;
+ for (int b = 0; b < num_batch + 1; b++) {
+ slice<0>(*batch_start, b, b + 1) = reduce_keepdim<red::sum, false>(
+ F<mshadow_op::less_than>(*valid_batch_id, ScalarExp<int32_t>(b)), 0);
+ }
+}
+
/*!
* \brief Assign output of nms by indexing input
*
@@ -435,10 +439,7 @@ void BoxNMSForward(const nnvm::NodeAttrs& attrs,
// calculate batch_start: accumulated sum to denote 1st sorted_index for a
given batch_index
valid_batch_id = (valid_sorted_index / ScalarExp<int32_t>(num_elem));
- for (int b = 0; b < num_batch + 1; b++) {
- slice<0>(batch_start, b, b + 1) = reduce_keepdim<red::sum, false>(
- F<mshadow_op::less_than>(valid_batch_id, ScalarExp<int32_t>(b)), 0);
- }
+ mxnet::op::NMSCalculateBatchStart(s, &batch_start, &valid_batch_id,
num_batch);
// pre-compute areas of candidates
areas = 0;