Commit: 314cf40f06dca493aeefe24bb0a14c5f071d8cd4
Author: Thomas Dinges
Date:   Thu May 19 12:47:41 2016 +0200
Branches: compositor-2016
https://developer.blender.org/rB314cf40f06dca493aeefe24bb0a14c5f071d8cd4

Cycles: Add support for bindless textures.

This adds support for CUDA texture objects (also known as bindless textures) 
for Kepler GPUs (GeForce 6xx and above).
This is used for all 2D/3D textures, data still uses arrays as before.

User benefits:
* No more limits on image textures on Kepler.
 We had 5 float4 and 145 byte4 slots there before; now we have 1024 float4 and 
1024 byte4.
 This can be extended further if we need to (just change the define).

* Single-channel texture slots (byte and float) are now supported on Kepler as 
well (1024 slots for each type).

ToDo / Issues:
* 3D textures don't work yet, or at least don't show up during render. I have no 
idea what's wrong yet.
* Dynamically allocate bindless_mapping array?

I hope Fermi still works fine, but that should be tested on a Fermi card before 
pushing to master.

Part of my GSoC 2016.

Reviewers: sergey, #cycles, brecht

Subscribers: swerner, jtheninja, brecht, sergey

Differential Revision: https://developer.blender.org/D1999

===================================================================

M       intern/cycles/device/device.h
M       intern/cycles/device/device_cuda.cpp
M       intern/cycles/device/device_multi.cpp
M       intern/cycles/kernel/geom/geom_volume.h
M       intern/cycles/kernel/kernel_compat_cuda.h
M       intern/cycles/kernel/kernel_textures.h
M       intern/cycles/kernel/svm/svm_image.h
M       intern/cycles/kernel/svm/svm_voxel.h
M       intern/cycles/render/image.cpp
M       intern/cycles/util/util_texture.h

===================================================================

diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 4c1b722..e11bb7f 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -54,7 +54,7 @@ public:
        bool display_device;
        bool advanced_shading;
        bool pack_images;
-       bool extended_images; /* flag for GPU and Multi device */
+       bool has_bindless_textures; /* flag for GPU and Multi device */
        bool use_split_kernel; /* Denotes if the device is going to run cycles 
using split-kernel */
        vector<DeviceInfo> multi_devices;
 
@@ -66,7 +66,7 @@ public:
                display_device = false;
                advanced_shading = true;
                pack_images = false;
-               extended_images = false;
+               has_bindless_textures = false;
                use_split_kernel = false;
        }
 };
@@ -230,6 +230,7 @@ public:
                (void)interpolation;  /* Ignored. */
                (void)extension;  /* Ignored. */
        };
+
        virtual void tex_free(device_memory& /*mem*/) {};
 
        /* pixel memory */
diff --git a/intern/cycles/device/device_cuda.cpp 
b/intern/cycles/device/device_cuda.cpp
index 12c62c0..39bb442 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -85,10 +85,10 @@ public:
        CUcontext cuContext;
        CUmodule cuModule;
        map<device_ptr, bool> tex_interp_map;
+       map<device_ptr, uint> tex_bindless_map;
        int cuDevId;
        int cuDevArchitecture;
        bool first_error;
-       bool use_texture_storage;
 
        struct PixelMem {
                GLuint cuPBO;
@@ -99,6 +99,10 @@ public:
 
        map<device_ptr, PixelMem> pixel_mem_map;
 
+       /* Bindless Textures */
+       device_vector<uint> bindless_mapping;
+       bool need_bindless_mapping;
+
        CUdeviceptr cuda_device_ptr(device_ptr mem)
        {
                return (CUdeviceptr)mem;
@@ -176,12 +180,13 @@ public:
        {
                first_error = true;
                background = background_;
-               use_texture_storage = true;
 
                cuDevId = info.num;
                cuDevice = 0;
                cuContext = 0;
 
+               need_bindless_mapping = false;
+
                /* intialize */
                if(cuda_error(cuInit(0)))
                        return;
@@ -211,11 +216,6 @@ public:
                cuDeviceComputeCapability(&major, &minor, cuDevId);
                cuDevArchitecture = major*100 + minor*10;
 
-               /* In order to use full 6GB of memory on Titan cards, use 
arrays instead
-                * of textures. On earlier cards this seems slower, but on 
Titan it is
-                * actually slightly faster in tests. */
-               use_texture_storage = (cuDevArchitecture < 300);
-
                cuda_pop_context();
        }
 
@@ -223,6 +223,10 @@ public:
        {
                task_pool.stop();
 
+               if(info.has_bindless_textures) {
+                       tex_free(bindless_mapping);
+               }
+
                cuda_assert(cuCtxDestroy(cuContext));
        }
 
@@ -400,6 +404,15 @@ public:
                return (result == CUDA_SUCCESS);
        }
 
+       void load_bindless_mapping()
+       {
+               if(info.has_bindless_textures && need_bindless_mapping) {
+                       tex_free(bindless_mapping);
+                       tex_alloc("__bindless_mapping", bindless_mapping, 
INTERPOLATION_NONE, EXTENSION_REPEAT);
+                       need_bindless_mapping = false;
+               }
+       }
+
        void mem_alloc(device_memory& mem, MemoryType /*type*/)
        {
                cuda_push_context();
@@ -479,126 +492,99 @@ public:
        {
                VLOG(1) << "Texture allocate: " << name << ", " << 
mem.memory_size() << " bytes.";
 
+               /* Check if we are on sm_30 or above.
+                * We use arrays and bindles textures for storage there */
+               bool has_bindless_textures = info.has_bindless_textures;
+
+               /* General variables for both architectures */
                string bind_name = name;
-               if(mem.data_depth > 1) {
-                       /* Kernel uses different bind names for 2d and 3d float 
textures,
-                        * so we have to adjust couple of things here.
-                        */
-                       vector<string> tokens;
-                       string_split(tokens, name, "_");
-                       bind_name = string_printf("__tex_image_%s_3d_%s",
-                                                 tokens[2].c_str(),
-                                                 tokens[3].c_str());
+               size_t dsize = datatype_size(mem.data_type);
+               size_t size = mem.memory_size();
+
+               CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
+               switch(extension) {
+                       case EXTENSION_REPEAT:
+                               address_mode = CU_TR_ADDRESS_MODE_WRAP;
+                               break;
+                       case EXTENSION_EXTEND:
+                               address_mode = CU_TR_ADDRESS_MODE_CLAMP;
+                               break;
+                       case EXTENSION_CLIP:
+                               address_mode = CU_TR_ADDRESS_MODE_BORDER;
+                               break;
+                       default:
+                               assert(0);
+                               break;
+               }
+
+               CUfilter_mode filter_mode;
+               if(interpolation == INTERPOLATION_CLOSEST) {
+                       filter_mode = CU_TR_FILTER_MODE_POINT;
+               }
+               else {
+                       filter_mode = CU_TR_FILTER_MODE_LINEAR;
                }
 
-               /* determine format */
                CUarray_format_enum format;
-               size_t dsize = datatype_size(mem.data_type);
-               size_t size = mem.memory_size();
-               bool use_texture = (interpolation != INTERPOLATION_NONE) || 
use_texture_storage;
+               switch(mem.data_type) {
+                       case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; 
break;
+                       case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; 
break;
+                       case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; 
break;
+                       case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
+                       default: assert(0); return;
+               }
 
-               if(use_texture) {
+               /* General variables for Fermi */
+               CUtexref texref = NULL;
 
-                       switch(mem.data_type) {
-                               case TYPE_UCHAR: format = 
CU_AD_FORMAT_UNSIGNED_INT8; break;
-                               case TYPE_UINT: format = 
CU_AD_FORMAT_UNSIGNED_INT32; break;
-                               case TYPE_INT: format = 
CU_AD_FORMAT_SIGNED_INT32; break;
-                               case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; 
break;
-                               default: assert(0); return;
+               if(!has_bindless_textures) {
+                       if(mem.data_depth > 1) {
+                               /* Kernel uses different bind names for 2d and 
3d float textures,
+                                * so we have to adjust couple of things here.
+                                */
+                               vector<string> tokens;
+                               string_split(tokens, name, "_");
+                               bind_name = 
string_printf("__tex_image_%s_3d_%s",
+                                                         tokens[2].c_str(),
+                                                         tokens[3].c_str());
                        }
 
-                       CUtexref texref = NULL;
-
                        cuda_push_context();
                        cuda_assert(cuModuleGetTexRef(&texref, cuModule, 
bind_name.c_str()));
+                       cuda_pop_context();
 
                        if(!texref) {
-                               cuda_pop_context();
                                return;
                        }
+               }
 
-                       if(interpolation != INTERPOLATION_NONE) {
-                               CUarray handle = NULL;
-
-                               if(mem.data_depth > 1) {
-                                       CUDA_ARRAY3D_DESCRIPTOR desc;
-
-                                       desc.Width = mem.data_width;
-                                       desc.Height = mem.data_height;
-                                       desc.Depth = mem.data_depth;
-                                       desc.Format = format;
-                                       desc.NumChannels = mem.data_elements;
-                                       desc.Flags = 0;
-
-                                       cuda_assert(cuArray3DCreate(&handle, 
&desc));
-                               }
-                               else {
-                                       CUDA_ARRAY_DESCRIPTOR desc;
-
-                                       desc.Width = mem.data_width;
-                                       desc.Height = mem.data_height;
-                                       desc.Format = format;
-                                       desc.NumChannels = mem.data_elements;
-
-                                       cuda_assert(cuArrayCreate(&handle, 
&desc));
-                               }
+               /* Data Storage */
+               if(interpolation == INTERPOLATION_NONE) {
+                       if(has_bindless_textures) {
+                               mem_alloc(mem, MEM_READ_ONLY);
+                               mem_copy_to(mem);
 
-                               if(!handle) {
-                                       cuda_pop_context();
-                                       return;
-                               }
+                               cuda_push_context();
 
-                               if(mem.data_depth > 1) {
-                                       CUDA_MEMCPY3D param;
-                                       memset(&param, 0, sizeof(param));
-                                       param.dstMemoryType = 
CU_MEMORYTYPE_ARRAY;
-                                       param.dstArray = handle;
-                                       param.srcMemoryType = 
CU_MEMORYTYPE_HOST;
-                                       param.srcHost = (void*)mem.data_pointer;
-                                       param.srcPitch = 
mem.data_width*dsize*mem.data_elements;
-                                       param.WidthInBytes = param.srcPitch;
-                                       param.Height = mem.data_height;
-                                       param.Depth = mem.data_depth;
-
-                                       cuda_assert(cuMemcpy3D(&param));
-                               }
-                               else if(mem.data_height > 1) {
-                                       CUDA_MEMCPY2D param;
-                                       memset(&param, 0, sizeof(param));
-                                       param.dstMemoryType = 
CU_MEMORYTYPE_ARRAY;
-                                       param.dstArray = handle;
-                                       param.srcMemoryType = 
CU_MEMORYTYPE_HOST;
-                                       param.srcHost = (void*)mem.data_pointer;
-                                       param.srcPitch = 
mem.data_width*dsize*mem.data_elements;
-                                       param.WidthInBytes = param.srcPitch;
-                                       param.Height = mem.data_height;
-
-                                       cuda_assert(cuMemcpy2D(&param));
-                               }
-                               else
-                                       cuda_assert(cuMemcpyHtoA(handle, 0, 
(void*)mem.data_pointer, size));
+                               CUdeviceptr cumem;
+                               size_t cubytes;
 
-                               cuda_assert(cuTexRefSetArray(texref, handle, 
CU_TRSA_OVERRIDE_FORMAT));
+                               cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, 
cuModule, bind_name.c_str()));
 
-                               if(interpolation == INTERPOLATION_CLOSEST) {
-                                       
cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
+                               if(cubytes == 8) {
+                                       /* 64 bit device pointer */
+                                       uint64_t ptr = mem.device_pointer;
+                                       cuda_assert(cuMemcpyHtoD(cumem, 
(void*)&ptr, cubytes));
                                }
-                               else if(interpolation == INTERPOLATION_LINEAR) {
-                                       
cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
-                               }
-                               else {/* CUBIC and SMART are unsupported for 
CUDA */
-                                       
cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
+                               else {
+                                       /* 32 bit device pointer */
+                                       uint32_t ptr = 
(uint32_t)mem.device_pointer;
+                                       cuda_assert(cuMemcpyHtoD(cumem, 
(void*)&ptr, cubytes));
                                }
-                               cuda_assert(cuTexRefSetFlags(texref, 
CU_TRSF_NORMALIZED_COORDINATES));
-
-                               mem.device_pointer = (device_ptr)handle;
-                               mem.device_size = size;
 
-                               stats.mem_alloc(size);
+                               cuda_pop_context();
                        }
                        else {
-                               cuda_pop_context();
-
                                mem_alloc(mem, MEM_READ_ONLY);
                                mem_copy_to(mem);
 
@@ -607,58 +593,149 @@ public:
                                cuda_assert(cuTexRefSetAddress(NULL, texref, 
cuda_device_ptr(mem.device_pointer), size));
                                cuda_assert(cuTexRefSetFilterMode(texref, 
CU_TR_FILTER_MODE_POINT));
                                cuda_assert(cuTexRefSetFlags(texref, 
CU_TRSF_READ_AS_INTEGER));
+
+                               cuda_pop_context();
                        }
+               }
+               /* Texture Storage */
+               else {
+                       CUarray handle = NULL;
 
-                       CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
-                       switch(extension) {
-                               case EXTENSION_REPEAT:
-                                       address_mode = CU_TR_ADDRESS_MODE_WRAP;
-                                       break;
-                               case EXTENSION_EXTEND:
-                                       address_mode = CU_TR_ADDRESS_MODE_CLAMP;
-                                       break;
-                               case EXTENSION_CLIP:
-                                       address_mode = 
CU_TR_ADDRESS_MODE_BORDER;
-                                       break;
-                               default:
-                                       assert(0);
-                                       break;
+                       cuda_push_context();
+
+                       if(mem.data_depth > 1) {
+                               CUDA_ARRAY3D_DESCRIPTOR desc;
+
+                               desc.Width = mem.data_width;
+                               desc.Height = mem.data_height;
+                               desc.Depth = mem.data_depth;
+                               desc.Format = format;
+                               desc.NumChannels = mem.data_elements;
+                               desc.Flags = 0;
+
+                               cuda_assert(cuArray3DCreate(&handle, &desc

@@ Diff output truncated at 10240 characters. @@

_______________________________________________
Bf-blender-cvs mailing list
[email protected]
https://lists.blender.org/mailman/listinfo/bf-blender-cvs

Reply via email to