samskalicky commented on a change in pull request #17762: Custom Operator
Random Number Generator Support
URL: https://github.com/apache/incubator-mxnet/pull/17762#discussion_r399440931
##########
File path: example/extensions/lib_custom_op/relu_lib.cu
##########
@@ -180,6 +182,80 @@ REGISTER_OP(my_state_relu)
.setCreateOpState(createOpStateCPU, "cpu")
.setCreateOpState(createOpStateGPU, "gpu");
+/*
+ * Below is noisy ReLU operator example
+ * noisy ReLU is made from ReLU extended to include Gaussian noise
+ * forward - add Gaussian noise generated from normal distribution to each unit
+ * backward - gradient doesn't need to change since noise is constant
+ */
+
+#define NumRandomPerThread 64 // mxnet recommended random numbers generated per thread
+
// Noisy-ReLU forward kernel: out[i] = max(in[i] + gaussian_noise, 0).
// Each thread owns one pre-initialized Philox state and processes a
// contiguous chunk of <step> elements; the launcher guarantees the total
// thread count does not exceed NumGPURandomStates.
__global__ void noisy_relu_gpu_forward(float *out, float *in, int64_t N, mx_gpu_rand_pt states, int step) {
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  // Private copy of this thread's RNG state -> an independent random sequence
  curandStatePhilox4_32_10_t local_state = states[tid];
  // This thread covers indices [tid*step, tid*step + step), clipped to N
  const int begin = tid * step;
  const int finish = begin + step;
  for (int idx = begin; idx < finish && idx < N; ++idx) {
    const float noisy = in[idx] + curand_normal(&local_state);
    out[idx] = noisy > 0 ? noisy : 0;
  }
}
+
/*
 * CPU forward for the noisy-ReLU custom op.
 * Adds one standard-normal noise sample (mean 0, stddev 1) to every input
 * element, then applies ReLU: out[i] = max(in[i] + noise, 0).
 * Uses the mt19937 state MXNet's resource manager hands out so runs are
 * reproducible under MXNet's seeding.
 * Returns MX_SUCCESS on completion.
 */
MXReturnValue noisyForwardCPU(std::map<std::string, std::string> attrs,
                              std::vector<MXTensor> inputs,
                              std::vector<MXTensor> outputs,
                              OpResource res) {
  float* in_data = inputs[0].data<float>();
  float* out_data = outputs[0].data<float>();

  // shared CPU RNG state provided by MXNet's resource manager
  std::mt19937 *states = res.get_cpu_rand_states();
  std::normal_distribution<float> dist_normal;

  // int64_t matches MXTensor::size() (the GPU path already uses int64_t N);
  // an int index would silently truncate for tensors with > 2^31 elements.
  // Hoisting size() also keeps the loop-invariant call out of the condition.
  const int64_t N = inputs[0].size();
  for (int64_t i = 0; i < N; ++i) {
    float noise = dist_normal(*states);
    out_data[i] = in_data[i] + noise > 0 ? in_data[i] + noise : 0;
  }
  return MX_SUCCESS;
}
+
+MXReturnValue noisyForwardGPU(std::map<std::string, std::string> attrs,
+ std::vector<MXTensor> inputs,
+ std::vector<MXTensor> outputs,
+ OpResource res) {
+ float* in_data = inputs[0].data<float>();
+ float* out_data = outputs[0].data<float>();
+
+ mx_stream_t cuda_stream = res.get_cuda_stream();
+ int64_t N = inputs[0].size();
+
+ // below is mxnet recommended workflow to parallel random number generating
+ int num_thread = (N + NumRandomPerThread - 1) / NumRandomPerThread;
+  // we should not launch more threads than mxnet supported random number GPU states
+  int num_thread_need = num_thread < NumGPURandomStates ? num_thread : NumGPURandomStates;
+  // each cuda thread processes [step * tid, step * tid + step) snippet of input tensor
+ int step = (N + num_thread_need - 1) / num_thread_need;
+  // this can ensure number of parallel threads less than mxnet supported random number states
+  int num_block = (num_thread_need + NumThreadPerBlock - 1) / NumThreadPerBlock;
Review comment:
this is great! will be super helpful for users.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services