Commit: f64fe321fa65f7c36ebf7104b84868d1bb65a2fc Author: Sergey Sharybin Date: Tue Jul 27 12:13:46 2021 +0200 Branches: cycles-x https://developer.blender.org/rBf64fe321fa65f7c36ebf7104b84868d1bb65a2fc
Merge branch 'master' into cycles-x =================================================================== =================================================================== diff --cc intern/cycles/device/cuda/device_impl.cpp index 0eaf787dbd7,00000000000..37fab8f8293 mode 100644,000000..100644 --- a/intern/cycles/device/cuda/device_impl.cpp +++ b/intern/cycles/device/cuda/device_impl.cpp @@@ -1,1370 -1,0 +1,1370 @@@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_CUDA + +# include <climits> +# include <limits.h> +# include <stdio.h> +# include <stdlib.h> +# include <string.h> + +# include "device/cuda/device_impl.h" + +# include "render/buffers.h" + +# include "util/util_debug.h" +# include "util/util_foreach.h" +# include "util/util_logging.h" +# include "util/util_map.h" +# include "util/util_md5.h" +# include "util/util_opengl.h" +# include "util/util_path.h" +# include "util/util_string.h" +# include "util/util_system.h" +# include "util/util_time.h" +# include "util/util_types.h" +# include "util/util_windows.h" + +CCL_NAMESPACE_BEGIN + +class CUDADevice; + +bool CUDADevice::have_precompiled_kernels() +{ + string cubins_path = path_get("lib"); + return path_exists(cubins_path); +} + +bool CUDADevice::show_samples() const +{ + /* The CUDADevice only processes one tile at a time, so showing samples is fine. 
*/ + return true; +} + +BVHLayoutMask CUDADevice::get_bvh_layout_mask() const +{ + return BVH_LAYOUT_BVH2; +} + +void CUDADevice::set_error(const string &error) +{ + Device::set_error(error); + + if (first_error) { + fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); + fprintf(stderr, + "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n"); + first_error = false; + } +} + +CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) + : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL) +{ + first_error = true; + + cuDevId = info.num; + cuDevice = 0; + cuContext = 0; + + cuModule = 0; + + need_texture_info = false; + + device_texture_headroom = 0; + device_working_headroom = 0; + move_texture_to_host = false; + map_host_limit = 0; + map_host_used = 0; + can_map_host = 0; + pitch_alignment = 0; + + /* Initialize CUDA. */ + CUresult result = cuInit(0); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result))); + return; + } + + /* Setup device and context. */ + result = cuDeviceGet(&cuDevice, cuDevId); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)", + cuewErrorString(result))); + return; + } + + /* CU_CTX_MAP_HOST for mapping host memory when out of device memory. + * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render, + * so we can predict which memory to map to host. */ + cuda_assert( + cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); + + cuda_assert(cuDeviceGetAttribute( + &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); + + unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX; + if (can_map_host) { + ctx_flags |= CU_CTX_MAP_HOST; + init_host_memory(); + } + + /* Create context. 
*/ + result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); + + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result))); + return; + } + + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + cuDevArchitecture = major * 100 + minor * 10; + + /* Pop context set by cuCtxCreate. */ + cuCtxPopCurrent(NULL); +} + +CUDADevice::~CUDADevice() +{ + texture_info.free(); + + cuda_assert(cuCtxDestroy(cuContext)); +} + +bool CUDADevice::support_device(const uint /*kernel_features*/) +{ + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + + /* We only support sm_30 and above */ + if (major < 3) { + set_error(string_printf( + "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor)); + return false; + } + + return true; +} + +bool CUDADevice::check_peer_access(Device *peer_device) +{ + if (peer_device == this) { + return false; + } + if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) { + return false; + } + + CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device); + + int can_access = 0; + cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice)); + if (can_access == 0) { + return false; + } + + // Ensure array access over the link is possible as well (for 3D textures) + cuda_assert(cuDeviceGetP2PAttribute(&can_access, + CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED, + cuDevice, + peer_device_cuda->cuDevice)); + if (can_access == 0) { + return false; + } + + // Enable peer access in both directions + { + const CUDAContextScope scope(this); + CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0); + if 
(result != CUDA_SUCCESS) { + set_error(string_printf("Failed to enable peer access on CUDA context (%s)", + cuewErrorString(result))); + return false; + } + } + { + const CUDAContextScope scope(peer_device_cuda); + CUresult result = cuCtxEnablePeerAccess(cuContext, 0); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to enable peer access on CUDA context (%s)", + cuewErrorString(result))); + return false; + } + } + + return true; +} + +bool CUDADevice::use_adaptive_compilation() +{ + return DebugFlags().cuda.adaptive_compile; +} + +/* Common NVCC flags which stay the same regardless of shading model, + * kernel sources md5 and only depend on compiler or compilation settings. + */ +string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features) +{ + const int machine = system_cpu_bits(); + const string source_path = path_get("source"); + const string include_path = source_path; + string cflags = string_printf( + "-m%d " + "--ptxas-options=\"-v\" " + "--use_fast_math " + "-DNVCC " + "-I\"%s\"", + machine, + include_path.c_str()); + if (use_adaptive_compilation()) { + cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features); + } + const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); + if (extra_cflags) { + cflags += string(" ") + string(extra_cflags); + } + +# ifdef WITH_NANOVDB + cflags += " -DWITH_NANOVDB"; +# endif + + return cflags; +} + +string CUDADevice::compile_kernel(const uint kernel_features, + const char *name, + const char *base, + bool force_ptx) +{ + /* Compute kernel name. */ + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + + /* Attempt to use kernel provided with Blender.
*/ + if (!use_adaptive_compilation()) { + if (!force_ptx) { + const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor)); + VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; + if (path_exists(cubin)) { + VLOG(1) << "Using precompiled kernel."; + return cubin; + } + } + + /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */ + int ptx_major = major, ptx_minor = minor; + while (ptx_major >= 3) { + const string ptx = path_get( + string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor)); + VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; + if (path_exists(ptx)) { + VLOG(1) << "Using precompiled kernel."; + return ptx; + } + + if (ptx_minor > 0) { + ptx_minor--; + } + else { + ptx_major--; + ptx_minor = 9; + } + } + } + + /* Try to use locally compiled kernel. */ + string source_path = path_get("source"); + const string source_md5 = path_files_md5_hash(source_path); + + /* We include cflags into md5 so changing cuda toolkit or changing other + * compiler command line arguments makes sure cubin gets re-built. + */ + string common_cflags = compile_kernel_get_common_cflags(kernel_features); + const string kernel_md5 = util_md5_string(source_md5 + common_cflags); + + const char *const kernel_ext = force_ptx ? "ptx" : "cubin"; + const char *const kernel_arch = force_ptx ? "compute" : "sm"; + const string cubin_file = string_printf( + "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext); + const string cubin = path_cache_get(path_join("kernels", cubin_file)); + VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; + if (path_ @@ Diff output truncated at 10240 characters. @@ _______________________________________________ Bf-blender-cvs mailing list [email protected] List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
