Commit: 314cf40f06dca493aeefe24bb0a14c5f071d8cd4 Author: Thomas Dinges Date: Thu May 19 12:47:41 2016 +0200 Branches: compositor-2016 https://developer.blender.org/rB314cf40f06dca493aeefe24bb0a14c5f071d8cd4
Cycles: Add support for bindless textures. This adds support for CUDA Texture objects (also known as Bindless textures) for Kepler GPUs (GeForce 6xx and above). This is used for all 2D/3D textures; data still uses arrays as before. User benefits: * No more limits on image textures on Kepler. We had 5 float4 and 145 byte4 slots there before, now we have 1024 float4 and 1024 byte4. This can be extended further if we need to (just change the define). * Single-channel texture slots (byte and float) are now supported on Kepler as well (1024 slots for each type). ToDo / Issues: * 3D textures don't work yet, at least don't show up during render. I have no idea what's wrong yet. * Dynamically allocate bindless_mapping array? I hope Fermi still works fine, but that should be tested on a Fermi card before pushing to master. Part of my GSoC 2016. Reviewers: sergey, #cycles, brecht Subscribers: swerner, jtheninja, brecht, sergey Differential Revision: https://developer.blender.org/D1999 =================================================================== M intern/cycles/device/device.h M intern/cycles/device/device_cuda.cpp M intern/cycles/device/device_multi.cpp M intern/cycles/kernel/geom/geom_volume.h M intern/cycles/kernel/kernel_compat_cuda.h M intern/cycles/kernel/kernel_textures.h M intern/cycles/kernel/svm/svm_image.h M intern/cycles/kernel/svm/svm_voxel.h M intern/cycles/render/image.cpp M intern/cycles/util/util_texture.h =================================================================== diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 4c1b722..e11bb7f 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -54,7 +54,7 @@ public: bool display_device; bool advanced_shading; bool pack_images; - bool extended_images; /* flag for GPU and Multi device */ + bool has_bindless_textures; /* flag for GPU and Multi device */ bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */ 
vector<DeviceInfo> multi_devices; @@ -66,7 +66,7 @@ public: display_device = false; advanced_shading = true; pack_images = false; - extended_images = false; + has_bindless_textures = false; use_split_kernel = false; } }; @@ -230,6 +230,7 @@ public: (void)interpolation; /* Ignored. */ (void)extension; /* Ignored. */ }; + virtual void tex_free(device_memory& /*mem*/) {}; /* pixel memory */ diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 12c62c0..39bb442 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -85,10 +85,10 @@ public: CUcontext cuContext; CUmodule cuModule; map<device_ptr, bool> tex_interp_map; + map<device_ptr, uint> tex_bindless_map; int cuDevId; int cuDevArchitecture; bool first_error; - bool use_texture_storage; struct PixelMem { GLuint cuPBO; @@ -99,6 +99,10 @@ public: map<device_ptr, PixelMem> pixel_mem_map; + /* Bindless Textures */ + device_vector<uint> bindless_mapping; + bool need_bindless_mapping; + CUdeviceptr cuda_device_ptr(device_ptr mem) { return (CUdeviceptr)mem; @@ -176,12 +180,13 @@ public: { first_error = true; background = background_; - use_texture_storage = true; cuDevId = info.num; cuDevice = 0; cuContext = 0; + need_bindless_mapping = false; + /* intialize */ if(cuda_error(cuInit(0))) return; @@ -211,11 +216,6 @@ public: cuDeviceComputeCapability(&major, &minor, cuDevId); cuDevArchitecture = major*100 + minor*10; - /* In order to use full 6GB of memory on Titan cards, use arrays instead - * of textures. On earlier cards this seems slower, but on Titan it is - * actually slightly faster in tests. 
*/ - use_texture_storage = (cuDevArchitecture < 300); - cuda_pop_context(); } @@ -223,6 +223,10 @@ public: { task_pool.stop(); + if(info.has_bindless_textures) { + tex_free(bindless_mapping); + } + cuda_assert(cuCtxDestroy(cuContext)); } @@ -400,6 +404,15 @@ public: return (result == CUDA_SUCCESS); } + void load_bindless_mapping() + { + if(info.has_bindless_textures && need_bindless_mapping) { + tex_free(bindless_mapping); + tex_alloc("__bindless_mapping", bindless_mapping, INTERPOLATION_NONE, EXTENSION_REPEAT); + need_bindless_mapping = false; + } + } + void mem_alloc(device_memory& mem, MemoryType /*type*/) { cuda_push_context(); @@ -479,126 +492,99 @@ public: { VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes."; + /* Check if we are on sm_30 or above. + * We use arrays and bindles textures for storage there */ + bool has_bindless_textures = info.has_bindless_textures; + + /* General variables for both architectures */ string bind_name = name; - if(mem.data_depth > 1) { - /* Kernel uses different bind names for 2d and 3d float textures, - * so we have to adjust couple of things here. 
- */ - vector<string> tokens; - string_split(tokens, name, "_"); - bind_name = string_printf("__tex_image_%s_3d_%s", - tokens[2].c_str(), - tokens[3].c_str()); + size_t dsize = datatype_size(mem.data_type); + size_t size = mem.memory_size(); + + CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; + switch(extension) { + case EXTENSION_REPEAT: + address_mode = CU_TR_ADDRESS_MODE_WRAP; + break; + case EXTENSION_EXTEND: + address_mode = CU_TR_ADDRESS_MODE_CLAMP; + break; + case EXTENSION_CLIP: + address_mode = CU_TR_ADDRESS_MODE_BORDER; + break; + default: + assert(0); + break; + } + + CUfilter_mode filter_mode; + if(interpolation == INTERPOLATION_CLOSEST) { + filter_mode = CU_TR_FILTER_MODE_POINT; + } + else { + filter_mode = CU_TR_FILTER_MODE_LINEAR; } - /* determine format */ CUarray_format_enum format; - size_t dsize = datatype_size(mem.data_type); - size_t size = mem.memory_size(); - bool use_texture = (interpolation != INTERPOLATION_NONE) || use_texture_storage; + switch(mem.data_type) { + case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break; + case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break; + case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break; + case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break; + default: assert(0); return; + } - if(use_texture) { + /* General variables for Fermi */ + CUtexref texref = NULL; - switch(mem.data_type) { - case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break; - case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break; - case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break; - case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break; - default: assert(0); return; + if(!has_bindless_textures) { + if(mem.data_depth > 1) { + /* Kernel uses different bind names for 2d and 3d float textures, + * so we have to adjust couple of things here. 
+ */ + vector<string> tokens; + string_split(tokens, name, "_"); + bind_name = string_printf("__tex_image_%s_3d_%s", + tokens[2].c_str(), + tokens[3].c_str()); } - CUtexref texref = NULL; - cuda_push_context(); cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str())); + cuda_pop_context(); if(!texref) { - cuda_pop_context(); return; } + } - if(interpolation != INTERPOLATION_NONE) { - CUarray handle = NULL; - - if(mem.data_depth > 1) { - CUDA_ARRAY3D_DESCRIPTOR desc; - - desc.Width = mem.data_width; - desc.Height = mem.data_height; - desc.Depth = mem.data_depth; - desc.Format = format; - desc.NumChannels = mem.data_elements; - desc.Flags = 0; - - cuda_assert(cuArray3DCreate(&handle, &desc)); - } - else { - CUDA_ARRAY_DESCRIPTOR desc; - - desc.Width = mem.data_width; - desc.Height = mem.data_height; - desc.Format = format; - desc.NumChannels = mem.data_elements; - - cuda_assert(cuArrayCreate(&handle, &desc)); - } + /* Data Storage */ + if(interpolation == INTERPOLATION_NONE) { + if(has_bindless_textures) { + mem_alloc(mem, MEM_READ_ONLY); + mem_copy_to(mem); - if(!handle) { - cuda_pop_context(); - return; - } + cuda_push_context(); - if(mem.data_depth > 1) { - CUDA_MEMCPY3D param; - memset(&param, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_ARRAY; - param.dstArray = handle; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = (void*)mem.data_pointer; - param.srcPitch = mem.data_width*dsize*mem.data_elements; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - param.Depth = mem.data_depth; - - cuda_assert(cuMemcpy3D(&param)); - } - else if(mem.data_height > 1) { - CUDA_MEMCPY2D param; - memset(&param, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_ARRAY; - param.dstArray = handle; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = (void*)mem.data_pointer; - param.srcPitch = mem.data_width*dsize*mem.data_elements; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - - 
cuda_assert(cuMemcpy2D(&param)); - } - else - cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size)); + CUdeviceptr cumem; + size_t cubytes; - cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT)); + cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str())); - if(interpolation == INTERPOLATION_CLOSEST) { - cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT)); + if(cubytes == 8) { + /* 64 bit device pointer */ + uint64_t ptr = mem.device_pointer; + cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); } - else if(interpolation == INTERPOLATION_LINEAR) { - cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR)); - } - else {/* CUBIC and SMART are unsupported for CUDA */ - cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR)); + else { + /* 32 bit device pointer */ + uint32_t ptr = (uint32_t)mem.device_pointer; + cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); } - cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES)); - - mem.device_pointer = (device_ptr)handle; - mem.device_size = size; - stats.mem_alloc(size); + cuda_pop_context(); } else { - cuda_pop_context(); - mem_alloc(mem, MEM_READ_ONLY); mem_copy_to(mem); @@ -607,58 +593,149 @@ public: cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size)); cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT)); cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER)); + + cuda_pop_context(); } + } + /* Texture Storage */ + else { + CUarray handle = NULL; - CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; - switch(extension) { - case EXTENSION_REPEAT: - address_mode = CU_TR_ADDRESS_MODE_WRAP; - break; - case EXTENSION_EXTEND: - address_mode = CU_TR_ADDRESS_MODE_CLAMP; - break; - case EXTENSION_CLIP: - address_mode = CU_TR_ADDRESS_MODE_BORDER; - break; - default: - assert(0); - break; + cuda_push_context(); + + if(mem.data_depth > 1) { + 
CUDA_ARRAY3D_DESCRIPTOR desc; + + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Depth = mem.data_depth; + desc.Format = format; + desc.NumChannels = mem.data_elements; + desc.Flags = 0; + + cuda_assert(cuArray3DCreate(&handle, &desc @@ Diff output truncated at 10240 characters. @@ _______________________________________________ Bf-blender-cvs mailing list [email protected] https://lists.blender.org/mailman/listinfo/bf-blender-cvs
