Hi Thomas, what about VS2013 Update 2? It supports AVX2 too...
----- Ursprüngliche Nachricht ----- Von: "Thomas Dinges" <[email protected]> Gesendet: 13.05.2014 10:38 An: "[email protected]" <[email protected]> Betreff: [Bf-blender-cvs] [ac908f6] soc-2014-cycles: Cycles: Add an AVX2 CPUkernel. Commit: ac908f6c1f6d77790d2645104d4ba9a139937317 Author: Thomas Dinges Date: Tue May 13 10:37:01 2014 +0200 https://developer.blender.org/rBac908f6c1f6d77790d2645104d4ba9a139937317 Cycles: Add an AVX2 CPU kernel. New optimized kernel, which requires AVX2 and FMA3 instruction sets. At the moment the speedup is small (~2%) as we only use gcc and clang auto optimization, but we can use dedicated intrinsics for that later. D482 would be a good basis for further improvements. =================================================================== M intern/cycles/CMakeLists.txt M intern/cycles/SConscript M intern/cycles/device/device_cpu.cpp M intern/cycles/kernel/CMakeLists.txt M intern/cycles/kernel/kernel.h A intern/cycles/kernel/kernel_avx2.cpp M intern/cycles/util/util_optimization.h M intern/cycles/util/util_system.cpp M intern/cycles/util/util_system.h =================================================================== diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index a1b0030..7a9739c 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -20,8 +20,10 @@ if(WIN32 AND MSVC) # /arch:AVX for VC2012 and above if(NOT MSVC_VERSION LESS 1700) set(CYCLES_AVX_ARCH_FLAGS "/arch:AVX") + set(CYCLES_AVX2_ARCH_FLAGS "/arch:AVX") elseif(NOT CMAKE_CL_64) set(CYCLES_AVX_ARCH_FLAGS "/arch:SSE2") + set(CYCLES_AVX2_ARCH_FLAGS "/arch:SSE2") endif() # there is no /arch:SSE3, but intrinsics are available anyway @@ -30,11 +32,13 @@ if(WIN32 AND MSVC) set(CYCLES_SSE3_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") set(CYCLES_SSE41_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") + 
set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") else() set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") + set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") @@ -48,6 +52,7 @@ elseif(CMAKE_COMPILER_IS_GNUCC) set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse") set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse") set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mfpmath=sse") + set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mfpmath=sse") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math") elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") @@ -57,6 +62,7 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3") set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1") set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx") + set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math") endif() @@ -67,6 +73,7 @@ if(CXX_HAS_SSE) -DWITH_KERNEL_SSE3 -DWITH_KERNEL_SSE41 -DWITH_KERNEL_AVX + -DWITH_KERNEL_AVX2 ) endif() diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript index 532238b..2439e0a 100644 --- a/intern/cycles/SConscript +++ b/intern/cycles/SConscript @@ -39,6 +39,7 @@ 
sources.remove(path.join('kernel', 'kernel_sse2.cpp')) sources.remove(path.join('kernel', 'kernel_sse3.cpp')) sources.remove(path.join('kernel', 'kernel_sse41.cpp')) sources.remove(path.join('kernel', 'kernel_avx.cpp')) +sources.remove(path.join('kernel', 'kernel_avx2.cpp')) incs = [] defs = [] @@ -98,6 +99,7 @@ elif env['OURPLATFORM'] == 'win64-vc': if env['MSVC_VERSION'] in ('11.0', '12.0'): kernel_flags['sse41'] = kernel_flags['sse3'] kernel_flags['avx'] = kernel_flags['sse41'] + ' /arch:AVX' + kernel_flags['avx2'] = kernel_flags['sse41'] + ' /arch:AVX' else: # -mavx only available with relatively new gcc/clang kernel_flags['sse2'] = '-ffast-math -msse -msse2 -mfpmath=sse' @@ -106,6 +108,7 @@ else: if (env['C_COMPILER_ID'] == 'gcc' and env['CCVERSION'] >= '4.6') or (env['C_COMPILER_ID'] == 'clang' and env['CCVERSION'] >= '3.1'): kernel_flags['avx'] = kernel_flags['sse41'] + ' -mavx' + kernel_flags['avx2'] = kernel_flags['avx'] + ' -mavx2 -mfma' for kernel_type in kernel_flags.keys(): defs.append('WITH_KERNEL_' + kernel_type.upper()) diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index c9cc759..fa2a344 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -62,6 +62,7 @@ public: system_cpu_support_sse3(); system_cpu_support_sse41(); system_cpu_support_avx(); + system_cpu_support_avx2(); } ~CPUDevice() @@ -167,6 +168,28 @@ public: int start_sample = tile.start_sample; int end_sample = tile.start_sample + tile.num_samples; +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if(system_cpu_support_avx2()) { + for(int sample = start_sample; sample < end_sample; sample++) { + if (task.get_cancel() || task_pool.canceled()) { + if(task.need_finish_queue == false) + break; + } + + for(int y = tile.y; y < tile.y + tile.h; y++) { + for(int x = tile.x; x < tile.x + tile.w; x++) { + kernel_cpu_avx2_path_trace(&kg, render_buffer, rng_state, + sample, x, y, tile.offset, tile.stride); + } + } + + tile.sample 
= sample + 1; + + task.update_progress(tile); + } + } + else +#endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX if(system_cpu_support_avx()) { for(int sample = start_sample; sample < end_sample; sample++) { @@ -293,6 +316,15 @@ public: float sample_scale = 1.0f/(task.sample + 1); if(task.rgba_half) { +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if(system_cpu_support_avx2()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_avx2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else +#endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX if(system_cpu_support_avx()) { for(int y = task.y; y < task.y + task.h; y++) @@ -337,6 +369,15 @@ public: } } else { +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if(system_cpu_support_avx2()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_avx2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else +#endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX if(system_cpu_support_avx()) { for(int y = task.y; y < task.y + task.h; y++) @@ -390,6 +431,17 @@ public: OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); #endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if(system_cpu_support_avx2()) { + for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { + kernel_cpu_avx2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); + + if(task.get_cancel() || task_pool.canceled()) + break; + } + } + else +#endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX if(system_cpu_support_avx()) { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index d18f4fa..ce82720 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ 
b/intern/cycles/kernel/CMakeLists.txt @@ -213,12 +213,14 @@ if(CXX_HAS_SSE) kernel_sse3.cpp kernel_sse41.cpp kernel_avx.cpp + kernel_avx2.cpp ) set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") set_source_files_properties(kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") + set_source_files_properties(kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") endif() diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index c4a0864..902b335 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -87,6 +87,17 @@ void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i); #endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +void kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, + int sample, int x, int y, int offset, int stride); +void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, + float sample_scale, int x, int y, int offset, int stride); +void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, + float sample_scale, int x, int y, int offset, int stride); +void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output, + int type, int i); +#endif + CCL_NAMESPACE_END #endif /* __KERNEL_H__ */ diff --git a/intern/cycles/kernel/kernel_avx2.cpp b/intern/cycles/kernel/kernel_avx2.cpp new file mode 100644 index 0000000..2cbad99 --- /dev/null +++ b/intern/cycles/kernel/kernel_avx2.cpp @@ -0,0 +1,82 @@ +/* + * Copyright 2011-2014 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +#define __KERNEL_SSE2__ +#define __KERNEL_SSE3__ +#define __KERNEL_SSSE3__ +#define __KERNEL_SSE41__ +#endif + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + +#include "kernel.h" +#include "kernel_compat_cpu.h" +#include "kernel_m @@ Diff output truncated at 10240 characters. @@ _______________________________________________ Bf-blender-cvs mailing list [email protected] http://lists.blender.org/mailman/listinfo/bf-blender-cvs _______________________________________________ Bf-committers mailing list [email protected] http://lists.blender.org/mailman/listinfo/bf-committers
