Commit: 5475314f4955dbc3af305577a26fe0b537380313
Author: Brecht Van Lommel
Date:   Sat Nov 4 18:06:48 2017 +0100
Branches: master
https://developer.blender.org/rB5475314f4955dbc3af305577a26fe0b537380313

Cycles: reserve CUDA local memory ahead of time.

This way we can log the amount of memory used, and it will be important
for host mapped memory support.

===================================================================

M       intern/cycles/device/device_cuda.cpp

===================================================================

diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 278fff02ae1..59d4fb055d0 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -234,24 +234,29 @@ public:
 
                need_texture_info = false;
 
-               /* intialize */
+               /* Intialize CUDA. */
                if(cuda_error(cuInit(0)))
                        return;
 
-               /* setup device and context */
+               /* Setup device and context. */
                if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
                        return;
 
+               /* CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
+                * so we can predict which memory to map to host. */
+               unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
+
+               /* Create context. */
                CUresult result;
 
                if(background) {
-                       result = cuCtxCreate(&cuContext, 0, cuDevice);
+                       result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
                }
                else {
-                       result = cuGLCtxCreate(&cuContext, 0, cuDevice);
+                       result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
 
                        if(result != CUDA_SUCCESS) {
-                               result = cuCtxCreate(&cuContext, 0, cuDevice);
+                               result = cuCtxCreate(&cuContext, ctx_flags, 
cuDevice);
                                background = true;
                        }
                }
@@ -542,9 +547,66 @@ public:
                if(cuda_error_(result, "cuModuleLoad"))
                        cuda_error_message(string_printf("Failed loading CUDA 
kernel %s.", filter_cubin.c_str()));
 
+               if(result == CUDA_SUCCESS) {
+                       reserve_local_memory(requested_features);
+               }
+
                return (result == CUDA_SUCCESS);
        }
 
+       void reserve_local_memory(const DeviceRequestedFeatures& 
requested_features)
+       {
+               if(use_split_kernel()) {
+                       /* Split kernel mostly uses global memory and adaptive compilation,
+                        * difficult to predict how much is needed currently. */
+                       return;
+               }
+
+               /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
+                * needed for kernel launches, so that we can reliably figure out when
+                * to allocate scene data in mapped host memory. */
+               CUDAContextScope scope(this);
+
+               size_t total = 0, free_before = 0, free_after = 0;
+               cuMemGetInfo(&free_before, &total);
+
+               /* Get kernel function. */
+               CUfunction cuPathTrace;
+
+               if(requested_features.use_integrator_branched) {
+                       cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, 
"kernel_cuda_branched_path_trace"));
+               }
+               else {
+                       cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, 
"kernel_cuda_path_trace"));
+               }
+
+               cuda_assert(cuFuncSetCacheConfig(cuPathTrace, 
CU_FUNC_CACHE_PREFER_L1));
+
+               int min_blocks, num_threads_per_block;
+               cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, 
&num_threads_per_block, cuPathTrace, NULL, 0, 0));
+
+               /* Launch kernel, using just 1 block appears sufficient to reserve
+                * memory for all multiprocessors. It would be good to do this in
+                * parallel for the multi GPU case still to make it faster. */
+               CUdeviceptr d_work_tiles = 0;
+               uint total_work_size = 0;
+
+               void *args[] = {&d_work_tiles,
+                               &total_work_size};
+
+               cuda_assert(cuLaunchKernel(cuPathTrace,
+                                          1, 1, 1,
+                                          num_threads_per_block, 1, 1,
+                                          0, 0, args, 0));
+
+               cuda_assert(cuCtxSynchronize());
+
+               cuMemGetInfo(&free_after, &total);
+               VLOG(1) << "Local memory reserved "
+                       << string_human_readable_number(free_before - 
free_after) << " bytes. ("
+                       << string_human_readable_size(free_before - free_after) 
<< ")";
+       }
+
        void load_texture_info()
        {
                if(!info.has_fermi_limits && need_texture_info) {

_______________________________________________
Bf-blender-cvs mailing list
[email protected]
https://lists.blender.org/mailman/listinfo/bf-blender-cvs

Reply via email to