apeforest commented on a change in pull request #17882: URL: https://github.com/apache/incubator-mxnet/pull/17882#discussion_r445868759
########## File path: src/operator/tensor/broadcast_reduce_op.h ########## @@ -1114,6 +1124,92 @@ struct broadcast_kernel { } }; +/** + * Changed the thread workload mapping from 1 + * thread/output element to 1 thread/input element to be broadcast. + * This approach leverages vectorization when the fastest-varying + * index (stride=1) of the tensor is to be broadcast. + * In other cases it simply performs better through better load balancing. + */ +template<typename OP> +struct broadcast_kernel_cpu { + template<typename IType, typename OType> + MSHADOW_XINLINE static void Map(index_t i, + IType *input, + OType *output, + const ShapeAndStride& aux_data, + const OpReqType req, + const int ndim) { + index_t idx = i; + index_t init_off = 0; + for (int iter = ndim - 1; idx > 0 && iter >= 0; --iter) { + size_t dim_idx = idx % aux_data.input_shape[iter]; + init_off += dim_idx * aux_data.out_stride[iter]; + idx /= aux_data.input_shape[iter]; + } + index_t stride_0, stride_1, stride_2; + // Each case is based on the number of axes to be broadcast + // (1, 2 or 3) after merging axes. + switch (aux_data.num_broadcast_axes) { + // when the input shape is one of the following forms: + // (x_1,1) or (x_1,1,x_2) or (1,x_1) + // x_1, x_2 are the sizes of the dimensions that are not to be broadcast + // in the case of (x_1,1) the system leverages vectorization, but in the other 2 + // the performance is improved due to avoidance of duplicate stride calculations + // for each output location input[i] needs to be written to.
+ case 1 : + stride_0 = aux_data.out_stride[aux_data.axes[0]]; + for (index_t l=0; l < aux_data.output_shape[aux_data.axes[0]]; l++) { + KERNEL_ASSIGN(output[init_off + l*stride_0], + req, OP::Map(input[i])); + } + break; + // when the input shape is one of the following forms: + // (x_1,1,x_2,1) or (1,x_1,1,x_2) or (x_1,1,x_2,1,x_3) + // x_1, x_2, x_3 are the sizes of the dimensions that are not to be broadcast + // the innermost loop can be vectorized by the compiler; in the outer loops + // the performance is improved due to avoidance of duplicate stride calculations + // for each output location input[i] needs to be written to. + case 2: + stride_1 = aux_data.out_stride[aux_data.axes[1]]; + stride_0 = aux_data.out_stride[aux_data.axes[0]]; + for (index_t k=0; k < aux_data.output_shape[aux_data.axes[1]]; k++) { + for (index_t l=0; l < aux_data.output_shape[aux_data.axes[0]]; l++) { + KERNEL_ASSIGN(output[init_off + k*stride_1 + l*stride_0], + req, OP::Map(input[i])); + } + } + break; + // when the input shape is of the form (1,x_1,1,x_2,1) + // x_1, x_2 are the sizes of the dimensions that are not to be broadcast + // here the last axis, which is [4], is the one where the compiler can vectorize + // the code; the outer 2 loops improve performance by avoiding + // duplicate stride calculations + // for each output location input[i] needs to be written to. + case 3: + stride_2 = aux_data.out_stride[aux_data.axes[2]]; + stride_1 = aux_data.out_stride[aux_data.axes[1]]; + stride_0 = aux_data.out_stride[aux_data.axes[0]]; + for (index_t j=0; j < aux_data.output_shape[aux_data.axes[2]]; j++) { Review comment: nit: add spaces around "=" ----------------------------------------------------------------   This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org