Hi Jürgen,

My understanding is that -O3 only applies to host (CPU) code, and we don't actually generate any host code with nvcc. The cubin files here are exactly the same with and without -O3.
On Wed, Jun 19, 2013 at 8:50 PM, Jürgen Herrmann <[email protected]> wrote: > Hi Brecht, > > nice work on Cycles performance in the last days, thank you very much ;-) > I realized that you didn't introduce the -O3 flag for nvcc yet. Are there > problem when using it? > This could boost performance a little bit more on all platforms. > > /Jürgen > > -----Ursprüngliche Nachricht----- > Von: [email protected] > [mailto:[email protected]] Im Auftrag von Brecht Van Lommel > Gesendet: Mittwoch, 19. Juni 2013 19:54 > An: [email protected] > Betreff: [Bf-blender-cvs] SVN commit: /data/svn/bf-blender [57580] > trunk/blender/intern/cycles: Cycles: prepare to make CUDA 5.0 the official > version we use > > Revision: 57580 > > http://projects.blender.org/scm/viewvc.php?view=rev&root=bf-blender&revision > =57580 > Author: blendix > Date: 2013-06-19 17:54:23 +0000 (Wed, 19 Jun 2013) > Log Message: > ----------- > Cycles: prepare to make CUDA 5.0 the official version we use > > * Add CUDA compiler version detection to cmake/scons/runtime > * Remove noinline in kernel_shader.h and reenable --use_fast_math if CUDA > 5.x > is used, these were workarounds for CUDA 4.2 bugs > * Change max number of registers to 32 for sm 2.x (based on performance > tests > from Martijn Berger and confirmed here), and also for NVidia OpenCL. > > Overall it seems that with these changes and the latest CUDA 5.0 download, > that performance is as good as or better than the 2.67b release with the > scenes and graphics cards I tested. 
> > Modified Paths: > -------------- > trunk/blender/intern/cycles/device/device_cuda.cpp > trunk/blender/intern/cycles/device/device_opencl.cpp > trunk/blender/intern/cycles/kernel/CMakeLists.txt > trunk/blender/intern/cycles/kernel/SConscript > trunk/blender/intern/cycles/kernel/kernel_jitter.h > trunk/blender/intern/cycles/kernel/kernel_shader.h > trunk/blender/intern/cycles/util/util_cuda.cpp > trunk/blender/intern/cycles/util/util_cuda.h > > Modified: trunk/blender/intern/cycles/device/device_cuda.cpp > =================================================================== > --- trunk/blender/intern/cycles/device/device_cuda.cpp 2013-06-19 17:17:51 > UTC (rev 57579) > +++ trunk/blender/intern/cycles/device/device_cuda.cpp 2013-06-19 17:54:23 > UTC (rev 57580) > @@ -271,21 +271,65 @@ > return ""; > } > > + int cuda_version = cuCompilerVersion(); > + > + if(cuda_version == 0) { > + cuda_error_message("CUDA nvcc compiler version could > not be parsed."); > + return ""; > + } > + > + if(cuda_version != 50) > + printf("CUDA version %d.%d detected, build may > succeed but only CUDA > +5.0 is officially supported.\n", cuda_version/10, cuda_version%10); > + > /* compile */ > string kernel = path_join(kernel_path, "kernel.cu"); > string include = kernel_path; > const int machine = system_cpu_bits(); > - const int maxreg = 24; > + string arch_flags; > > + /* build flags depending on CUDA version and arch */ > + if(cuda_version < 50) { > + /* CUDA 4.x */ > + if(major == 1) { > + /* sm_1x */ > + arch_flags = "--maxrregcount=24 > --opencc-options -OPT:Olimit=0"; > + } > + else if(major == 2) { > + /* sm_2x */ > + arch_flags = "--maxrregcount=24"; > + } > + else { > + /* sm_3x */ > + arch_flags = "--maxrregcount=32"; > + } > + } > + else { > + /* CUDA 4.x */ > + if(major == 1) { > + /* sm_1x */ > + arch_flags = "--maxrregcount=24 > --opencc-options -OPT:Olimit=0 --use_fast_math"; > + } > + else if(major == 2) { > + /* sm_2x */ > + arch_flags = "--maxrregcount=32 > 
--use_fast_math"; > + } > + else { > + /* sm_3x */ > + arch_flags = "--maxrregcount=32 > --use_fast_math"; > + } > + } > + > double starttime = time_dt(); > printf("Compiling CUDA kernel ...\n"); > > path_create_directories(cubin); > > string command = string_printf("\"%s\" -arch=sm_%d%d -m%d > --cubin \"%s\" " > - "-o \"%s\" --ptxas-options=\"-v\" --maxrregcount=%d > --opencc-options -OPT:Olimit=0 -I\"%s\" -DNVCC", > - nvcc.c_str(), major, minor, machine, kernel.c_str(), > cubin.c_str(), maxreg, include.c_str()); > + "-o \"%s\" --ptxas-options=\"-v\" %s -I\"%s\" -DNVCC > -D__KERNEL_CUDA_VERSION__=%d", > + nvcc.c_str(), major, minor, machine, kernel.c_str(), > cubin.c_str(), > +arch_flags.c_str(), include.c_str(), cuda_version); > > + printf("%s\n", command.c_str()); > + > if(system(command.c_str()) == -1) { > cuda_error_message("Failed to execute compilation > command, see console for details."); > return ""; > > Modified: trunk/blender/intern/cycles/device/device_opencl.cpp > =================================================================== > --- trunk/blender/intern/cycles/device/device_opencl.cpp 2013-06-19 > 17:17:51 UTC (rev 57579) > +++ trunk/blender/intern/cycles/device/device_opencl.cpp 2013-06-19 > 17:54:23 UTC (rev 57580) > @@ -85,7 +85,7 @@ > string build_options = " -cl-fast-relaxed-math "; > > if(platform == "NVIDIA CUDA") > - build_options += "-D__KERNEL_OPENCL_NVIDIA__ > -cl-nv-maxrregcount=24 -cl-nv-verbose "; > + build_options += "-D__KERNEL_OPENCL_NVIDIA__ > -cl-nv-maxrregcount=32 > +-cl-nv-verbose "; > > else if(platform == "Apple") > build_options += "-D__KERNEL_OPENCL_APPLE__ > -Wno-missing-prototypes "; > > Modified: trunk/blender/intern/cycles/kernel/CMakeLists.txt > =================================================================== > --- trunk/blender/intern/cycles/kernel/CMakeLists.txt 2013-06-19 17:17:51 > UTC (rev 57579) > +++ trunk/blender/intern/cycles/kernel/CMakeLists.txt 2013-06-19 17:54:23 > UTC (rev 57580) > @@ -117,32 +117,68 
@@ > # CUDA module > > if(WITH_CYCLES_CUDA_BINARIES) > + # 32 bit or 64 bit > if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") > set(CUDA_BITS 64) > else() > set(CUDA_BITS 32) > endif() > > + # CUDA version > + execute_process (COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" > OUTPUT_VARIABLE NVCC_OUT) > + string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\1" > CUDA_VERSION_MAJOR ${NVCC_OUT}) > + string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\2" > CUDA_VERSION_MINOR ${NVCC_OUT}) > + set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}") > + > + # build for each arch > set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} > ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS}) > set(cuda_cubins) > > foreach(arch ${CYCLES_CUDA_BINARIES_ARCH}) > set(cuda_cubin kernel_${arch}.cubin) > > - if(${arch} MATCHES "sm_1[0-9]") > - # sm_1x > - set(cuda_arch_flags "--maxrregcount=24 > --opencc-options -OPT:Olimit=0") > - elseif(${arch} MATCHES "sm_2[0-9]") > - # sm_2x > - set(cuda_arch_flags "--maxrregcount=24") > + set(cuda_version_flags > "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}") > + > + # warn for other versions > + if(CUDA_VERSION MATCHES "50") > else() > - # sm_3x > - set(cuda_arch_flags "--maxrregcount=32") > + message(STATUS "CUDA version > +${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, build may succeed > +but only CUDA 5.0 is officially supported") > endif() > + > + # build flags depending on CUDA version and arch > + if(CUDA_VERSION LESS 50) > + # CUDA 4.x > + if(${arch} MATCHES "sm_1[0-9]") > + # sm_1x > + set(cuda_arch_flags "--maxrregcount=24 > --opencc-options -OPT:Olimit=0") > + elseif(${arch} MATCHES "sm_2[0-9]") > + # sm_2x > + set(cuda_arch_flags "--maxrregcount=24") > + else() > + # sm_3x > + set(cuda_arch_flags "--maxrregcount=32") > + endif() > + > + set(cuda_math_flags "") > + else() > + # CUDA 5.x > + if(${arch} MATCHES "sm_1[0-9]") > + # sm_1x > + set(cuda_arch_flags "--maxrregcount=24 > --opencc-options -OPT:Olimit=0") > + 
elseif(${arch} MATCHES "sm_2[0-9]") > + # sm_2x > + set(cuda_arch_flags "--maxrregcount=32") > + else() > + # sm_3x > + set(cuda_arch_flags "--maxrregcount=32") > + endif() > + > + set(cuda_math_flags "--use_fast_math") > + endif() > > add_custom_command( > OUTPUT ${cuda_cubin} > - COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} > -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o > ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" > ${cuda_arch_flags} -I${CMAKE_CURRENT_SOURCE_DIR}/../util > -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= > -DCCL_NAMESPACE_END= -DNVCC > + COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} > -m${CUDA_BITS} --cubin > +${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o > +${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" > +${cuda_arch_flags} ${cuda_version_flags} ${cuda_math_flags} > +-I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm > +-DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC > DEPENDS ${cuda_sources}) > > delayed_install("${CMAKE_CURRENT_BINARY_DIR}" > "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib) > > Modified: trunk/blender/intern/cycles/kernel/SConscript > =================================================================== > --- trunk/blender/intern/cycles/kernel/SConscript 2013-06-19 17:17:51 > UTC (rev 57579) > +++ trunk/blender/intern/cycles/kernel/SConscript 2013-06-19 17:54:23 > UTC (rev 57580) > @@ -25,6 +25,8 @@ > # > # ***** END GPL LICENSE BLOCK ***** > > +import re > +import subprocess > import sys > import os > import Blender as B > @@ -60,10 +62,19 @@ > svm_dir = os.path.join(source_dir, "../svm") > closure_dir = os.path.join(source_dir, "../closure") > > + # get CUDA version > + nvcc_pipe = subprocess.Popen([nvcc, > "--version"],stdout=subprocess.PIPE,stderr=subprocess.PIPE) > + output, erroroutput = nvcc_pipe.communicate() > + cuda_major_minor = re.findall(r'release (\d+).(\d+)', output)[0] > + cuda_version = int(cuda_major_minor[0])*10 + > + 
int(cuda_major_minor[1]) > + > + if cuda_version != 50: > + print("CUDA version %d.%d detected, build may succeed but only > + CUDA 5.0 is officially supported." % (cuda_version/10, > + cuda_version%10)) > + > # nvcc flags > nvcc_flags = "-m%s" % (bits) > - nvcc_flags += " --cubin --ptxas-options=\"-v\" --maxrregcount=24" > - nvcc_flags += " --opencc-options -OPT:Olimit=0" > + nvcc_flags += " --cubin --ptxas-options=\"-v\"" > + nvcc_flags += " -D__KERNEL_CUDA_VERSION__=%d" % (cuda_version) > nvcc_flags += " -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC" > nvcc_flags += " -I \"%s\" -I \"%s\" -I \"%s\"" % (util_dir, svm_dir, > closure_dir) > > @@ -75,8 +86,32 @@ > for arch in cuda_archs: > cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch) > > - command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, > nvcc_flags, kernel_file, cubin_file) > + # build flags depending on CUDA version and arch > + if cuda_version < 50: > + # CUDA 4.x > + if arch.startswith("sm_1"): > + # sm_1x > + cuda_arch_flags = "--maxrregcount=24 --opencc-options > -OPT:Olimit=0" > + elif arch.startswith("sm_2"): > + # sm_2x > + cuda_arch_flags = "--maxrregcount=24" > + else: > + # sm_3x > + cuda_arch_flags = "--maxrregcount=32" > + else: > + # CUDA 5.x > + if arch.startswith("sm_1"): > + # sm_1x > + cuda_arch_flags = "--maxrregcount=24 --opencc-options > -OPT:Olimit=0 --use_fast_math" > + elif arch.startswith("sm_2"): > + # sm_2x > + cuda_arch_flags = "--maxrregcount=32 --use_fast_math" > + else: > + # sm_3x > + cuda_arch_flags = "--maxrregcount=32 --use_fast_math" > > + command = "\"%s\" -arch=%s %s %s \"%s\" -o \"%s\"" % (nvcc, > + arch, nvcc_flags, cuda_arch_flags, kernel_file, cubin_file) > + > kernel.Command(cubin_file, 'kernel.cu', command) > kernel.Depends(cubin_file, dependencies) > > > Modified: trunk/blender/intern/cycles/kernel/kernel_jitter.h > =================================================================== > --- 
trunk/blender/intern/cycles/kernel/kernel_jitter.h 2013-06-19 17:17:51 > UTC (rev 57579) > +++ trunk/blender/intern/cycles/kernel/kernel_jitter.h 2013-06-19 17:54:23 > UTC (rev 57580) > @@ -137,7 +137,7 @@ > } > > #ifdef __CMJ__ > -__device_noinline float cmj_sample_1D(int s, int N, int p) > +__device float cmj_sample_1D(int s, int N, int p) > { > uint x = cmj_permute(s, N, p * 0x68bc21eb); > float jx = cmj_randfloat(s, p * 0x967a889b); @@ -146,7 +146,7 @@ > return (x + jx)*invN; > } > > -__device_noinline void cmj_sample_2D(int s, int N, int p, float *fx, float > *fy) > +__device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy) > { > int m = float_to_int(sqrtf(N)); > int n = (N + m - 1)/m; > > Modified: trunk/blender/intern/cycles/kernel/kernel_shader.h > =================================================================== > --- trunk/blender/intern/cycles/kernel/kernel_shader.h 2013-06-19 17:17:51 > UTC (rev 57579) > > @@ Diff output truncated at 10240 characters. @@ > _______________________________________________ > Bf-blender-cvs mailing list > [email protected] > http://lists.blender.org/mailman/listinfo/bf-blender-cvs > > _______________________________________________ > Bf-committers mailing list > [email protected] > http://lists.blender.org/mailman/listinfo/bf-committers _______________________________________________ Bf-committers mailing list [email protected] http://lists.blender.org/mailman/listinfo/bf-committers
