Commit: 9d236ac06c2b6511365eb53f84bbe366c76acc72
Author: Thomas Dinges
Date:   Thu Aug 11 22:47:53 2016 +0200
Branches: master
https://developer.blender.org/rB9d236ac06c2b6511365eb53f84bbe366c76acc72

Cycles: Enable half float support (4 channels and 1 channel) on CUDA.

Currently, OpenEXR half-float files benefit from this and will use only half
of the memory. More space for HDRs!

Part of my GSoC 2016.

===================================================================

M       intern/cycles/device/device_cuda.cpp
M       intern/cycles/kernel/kernel_compat_cuda.h
M       intern/cycles/kernel/svm/svm_image.h
M       intern/cycles/util/util_half.h
M       intern/cycles/util/util_texture.h

===================================================================

diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index a85f340..76e5249 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -576,6 +576,7 @@ public:
                        case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; 
break;
                        case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; 
break;
                        case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
+                       case TYPE_HALF: format = CU_AD_FORMAT_HALF; break;
                        default: assert(0); return;
                }
 
@@ -747,8 +748,12 @@ public:
                                }
 
                                /* Resize once */
-                               if(flat_slot >= bindless_mapping.size())
-                                       bindless_mapping.resize(4096); 
/*TODO(dingto): Make this a variable */
+                               if(flat_slot >= bindless_mapping.size()) {
+                                       /* Allocate some slots in advance, to 
reduce amount
+                                        * of re-allocations.
+                                        */
+                                       bindless_mapping.resize(flat_slot + 
128);
+                               }
 
                                /* Set Mapping and tag that we need to 
(re-)upload to device */
                                bindless_mapping.get_data()[flat_slot] = 
(uint)tex;
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index 063220b..d656fac 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -31,6 +31,7 @@
 #endif
 
 #include <cuda.h>
+#include <cuda_fp16.h>
 #include <float.h>
 
 /* Qualifier wrappers for different names on different devices */
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index f359829..9050ce9 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Float4 textures on various devices. */
 #if defined(__KERNEL_CPU__)
-#  define TEX_NUM_FLOAT4_IMAGES        TEX_NUM_FLOAT4_CPU
+#  define TEX_NUM_FLOAT4_IMAGES                TEX_NUM_FLOAT4_CPU
 #elif defined(__KERNEL_CUDA__)
 #  if __CUDA_ARCH__ < 300
 #    define TEX_NUM_FLOAT4_IMAGES      TEX_NUM_FLOAT4_CUDA
@@ -277,8 +277,10 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int 
id, float x, float y,
        }
 #  else
        CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
+       /* float4, byte4 and half4 */
        if(id < TEX_START_FLOAT_CUDA_KEPLER)
                r = kernel_tex_image_interp_float4(tex, x, y);
+       /* float, byte and half */
        else {
                float f = kernel_tex_image_interp_float(tex, x, y);
                r = make_float4(f, f, f, 1.0);
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index ae85ab3..5db3384 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -33,17 +33,21 @@ CCL_NAMESPACE_BEGIN
 
 #else
 
+/* CUDA has its own half data type, so there is no need to define it then. */
+#ifndef __KERNEL_CUDA__
 typedef unsigned short half;
+#endif
+
 struct half4 { half x, y, z, w; };
 
 #ifdef __KERNEL_CUDA__
 
 ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
 {
-       h[0] = __float2half_rn(f.x * scale);
-       h[1] = __float2half_rn(f.y * scale);
-       h[2] = __float2half_rn(f.z * scale);
-       h[3] = __float2half_rn(f.w * scale);
+       h[0] = __float2half(f.x * scale);
+       h[1] = __float2half(f.y * scale);
+       h[2] = __float2half(f.z * scale);
+       h[3] = __float2half(f.w * scale);
 }
 
 #else
diff --git a/intern/cycles/util/util_texture.h b/intern/cycles/util/util_texture.h
index ec3ee2b..be1177d 100644
--- a/intern/cycles/util/util_texture.h
+++ b/intern/cycles/util/util_texture.h
@@ -52,10 +52,10 @@ CCL_NAMESPACE_BEGIN
 /* CUDA (Kepler, Geforce 6xx and above) */
 #define TEX_NUM_FLOAT4_CUDA_KEPLER             1024
 #define TEX_NUM_BYTE4_CUDA_KEPLER              1024
-#define TEX_NUM_HALF4_CUDA_KEPLER              0
+#define TEX_NUM_HALF4_CUDA_KEPLER              1024
 #define TEX_NUM_FLOAT_CUDA_KEPLER              1024
 #define TEX_NUM_BYTE_CUDA_KEPLER               1024
-#define TEX_NUM_HALF_CUDA_KEPLER               0
+#define TEX_NUM_HALF_CUDA_KEPLER               1024
 #define TEX_START_FLOAT4_CUDA_KEPLER   0
 #define TEX_START_BYTE4_CUDA_KEPLER            TEX_NUM_FLOAT4_CUDA_KEPLER
 #define TEX_START_HALF4_CUDA_KEPLER            (TEX_NUM_FLOAT4_CUDA_KEPLER + 
TEX_NUM_BYTE4_CUDA_KEPLER)

_______________________________________________
Bf-blender-cvs mailing list
[email protected]
https://lists.blender.org/mailman/listinfo/bf-blender-cvs

Reply via email to