Hi Jürgen, I will check this later; I have not tested this on Windows yet (and it is only in my branch so far). But I think it’s safe to add an AVX2 flag for MSVC 2013.
Thomas Am 13.05.2014 um 12:04 schrieb Jürgen Herrmann <[email protected]>: > Hi Thomas, > > What about VS2013 update 2? > It supports avx2 too... > > ----- Ursprüngliche Nachricht ----- > Von: "Thomas Dinges" <[email protected]> > Gesendet: 13.05.2014 10:38 > An: "[email protected]" <[email protected]> > Betreff: [Bf-blender-cvs] [ac908f6] soc-2014-cycles: Cycles: Add an AVX2 > CPUkernel. > > Commit: ac908f6c1f6d77790d2645104d4ba9a139937317 > Author: Thomas Dinges > Date: Tue May 13 10:37:01 2014 +0200 > https://developer.blender.org/rBac908f6c1f6d77790d2645104d4ba9a139937317 > > Cycles: Add an AVX2 CPU kernel. > > New optimized kernel, which requires AVX2 and FMA3 instruction sets. > At the moment the speedup is small (~2%) as we only use gcc and clang auto > optimization, but we can use dedicated intrinsics for that later. > > D482 would be a good basis for further improvements. > > =================================================================== > > M intern/cycles/CMakeLists.txt > M intern/cycles/SConscript > M intern/cycles/device/device_cpu.cpp > M intern/cycles/kernel/CMakeLists.txt > M intern/cycles/kernel/kernel.h > A intern/cycles/kernel/kernel_avx2.cpp > M intern/cycles/util/util_optimization.h > M intern/cycles/util/util_system.cpp > M intern/cycles/util/util_system.h > > =================================================================== > > diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt > index a1b0030..7a9739c 100644 > --- a/intern/cycles/CMakeLists.txt > +++ b/intern/cycles/CMakeLists.txt > @@ -20,8 +20,10 @@ if(WIN32 AND MSVC) > # /arch:AVX for VC2012 and above > if(NOT MSVC_VERSION LESS 1700) > set(CYCLES_AVX_ARCH_FLAGS "/arch:AVX") > + set(CYCLES_AVX2_ARCH_FLAGS "/arch:AVX") > elseif(NOT CMAKE_CL_64) > set(CYCLES_AVX_ARCH_FLAGS "/arch:SSE2") > + set(CYCLES_AVX2_ARCH_FLAGS "/arch:SSE2") > endif() > > # there is no /arch:SSE3, but intrinsics are available anyway > @@ -30,11 +32,13 @@ if(WIN32 AND MSVC) > 
set(CYCLES_SSE3_KERNEL_FLAGS "/fp:fast > -D_CRT_SECURE_NO_WARNINGS /GS-") > set(CYCLES_SSE41_KERNEL_FLAGS "/fp:fast > -D_CRT_SECURE_NO_WARNINGS /GS-") > set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast > -D_CRT_SECURE_NO_WARNINGS /GS-") > + set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} > /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") > else() > set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 /fp:fast > -D_CRT_SECURE_NO_WARNINGS /GS-") > set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 /fp:fast > -D_CRT_SECURE_NO_WARNINGS /GS-") > set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 /fp:fast > -D_CRT_SECURE_NO_WARNINGS /GS-") > set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast > -D_CRT_SECURE_NO_WARNINGS /GS-") > + set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} > /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") > endif() > > set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast > -D_CRT_SECURE_NO_WARNINGS /GS-") > @@ -48,6 +52,7 @@ elseif(CMAKE_COMPILER_IS_GNUCC) > set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 > -mssse3 -mfpmath=sse") > set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 > -mssse3 -msse4.1 -mfpmath=sse") > set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 > -mssse3 -msse4.1 -mavx -mfpmath=sse") > + set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 > -mssse3 -msse4.1 -mavx -mavx2 -mfma -mfpmath=sse") > endif() > set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math") > elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") > @@ -57,6 +62,7 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") > set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 > -mssse3") > set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 > -mssse3 -msse4.1") > set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 > -mssse3 -msse4.1 -mavx") > + set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 > -mssse3 -msse4.1 -mavx -mavx2 -mfma") > endif() > set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math") > 
endif() > @@ -67,6 +73,7 @@ if(CXX_HAS_SSE) > -DWITH_KERNEL_SSE3 > -DWITH_KERNEL_SSE41 > -DWITH_KERNEL_AVX > + -DWITH_KERNEL_AVX2 > ) > endif() > > diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript > index 532238b..2439e0a 100644 > --- a/intern/cycles/SConscript > +++ b/intern/cycles/SConscript > @@ -39,6 +39,7 @@ sources.remove(path.join('kernel', 'kernel_sse2.cpp')) > sources.remove(path.join('kernel', 'kernel_sse3.cpp')) > sources.remove(path.join('kernel', 'kernel_sse41.cpp')) > sources.remove(path.join('kernel', 'kernel_avx.cpp')) > +sources.remove(path.join('kernel', 'kernel_avx2.cpp')) > > incs = [] > defs = [] > @@ -98,6 +99,7 @@ elif env['OURPLATFORM'] == 'win64-vc': > if env['MSVC_VERSION'] in ('11.0', '12.0'): > kernel_flags['sse41'] = kernel_flags['sse3'] > kernel_flags['avx'] = kernel_flags['sse41'] + ' /arch:AVX' > + kernel_flags['avx2'] = kernel_flags['sse41'] + ' /arch:AVX' > else: > # -mavx only available with relatively new gcc/clang > kernel_flags['sse2'] = '-ffast-math -msse -msse2 -mfpmath=sse' > @@ -106,6 +108,7 @@ else: > > if (env['C_COMPILER_ID'] == 'gcc' and env['CCVERSION'] >= '4.6') or > (env['C_COMPILER_ID'] == 'clang' and env['CCVERSION'] >= '3.1'): > kernel_flags['avx'] = kernel_flags['sse41'] + ' -mavx' > + kernel_flags['avx2'] = kernel_flags['avx'] + ' -mavx2 -mfma' > > for kernel_type in kernel_flags.keys(): > defs.append('WITH_KERNEL_' + kernel_type.upper()) > diff --git a/intern/cycles/device/device_cpu.cpp > b/intern/cycles/device/device_cpu.cpp > index c9cc759..fa2a344 100644 > --- a/intern/cycles/device/device_cpu.cpp > +++ b/intern/cycles/device/device_cpu.cpp > @@ -62,6 +62,7 @@ public: > system_cpu_support_sse3(); > system_cpu_support_sse41(); > system_cpu_support_avx(); > + system_cpu_support_avx2(); > } > > ~CPUDevice() > @@ -167,6 +168,28 @@ public: > int start_sample = tile.start_sample; > int end_sample = tile.start_sample + tile.num_samples; > > +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 > + 
if(system_cpu_support_avx2()) { > + for(int sample = start_sample; sample < > end_sample; sample++) { > + if (task.get_cancel() || > task_pool.canceled()) { > + if(task.need_finish_queue == > false) > + break; > + } > + > + for(int y = tile.y; y < tile.y + > tile.h; y++) { > + for(int x = tile.x; x < tile.x > + tile.w; x++) { > + > kernel_cpu_avx2_path_trace(&kg, render_buffer, rng_state, > + > sample, x, y, tile.offset, tile.stride); > + } > + } > + > + tile.sample = sample + 1; > + > + task.update_progress(tile); > + } > + } > + else > +#endif > #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX > if(system_cpu_support_avx()) { > for(int sample = start_sample; sample < > end_sample; sample++) { > @@ -293,6 +316,15 @@ public: > float sample_scale = 1.0f/(task.sample + 1); > > if(task.rgba_half) { > +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 > + if(system_cpu_support_avx2()) { > + for(int y = task.y; y < task.y + task.h; y++) > + for(int x = task.x; x < task.x + > task.w; x++) > + > kernel_cpu_avx2_convert_to_half_float(&kernel_globals, > (uchar4*)task.rgba_half, (float*)task.buffer, > + > sample_scale, x, y, task.offset, > task.stride); > + } > + else > +#endif > #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX > if(system_cpu_support_avx()) { > for(int y = task.y; y < task.y + task.h; y++) > @@ -337,6 +369,15 @@ public: > } > } > else { > +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 > + if(system_cpu_support_avx2()) { > + for(int y = task.y; y < task.y + task.h; y++) > + for(int x = task.x; x < task.x + > task.w; x++) > + > kernel_cpu_avx2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, > (float*)task.buffer, > + > sample_scale, x, y, task.offset, task.stride); > + } > + else > +#endif > #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX > if(system_cpu_support_avx()) { > for(int y = task.y; y < task.y + task.h; y++) > @@ -390,6 +431,17 @@ public: > OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); > #endif > > +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 > + 
if(system_cpu_support_avx2()) { > + for(int x = task.shader_x; x < task.shader_x + > task.shader_w; x++) { > + kernel_cpu_avx2_shader(&kg, > (uint4*)task.shader_input, (float4*)task.shader_output, > task.shader_eval_type, x); > + > + if(task.get_cancel() || task_pool.canceled()) > + break; > + } > + } > + else > +#endif > #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX > if(system_cpu_support_avx()) { > for(int x = task.shader_x; x < task.shader_x + > task.shader_w; x++) { > diff --git a/intern/cycles/kernel/CMakeLists.txt > b/intern/cycles/kernel/CMakeLists.txt > index d18f4fa..ce82720 100644 > --- a/intern/cycles/kernel/CMakeLists.txt > +++ b/intern/cycles/kernel/CMakeLists.txt > @@ -213,12 +213,14 @@ if(CXX_HAS_SSE) > kernel_sse3.cpp > kernel_sse41.cpp > kernel_avx.cpp > + kernel_avx2.cpp > ) > > set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS > "${CYCLES_SSE2_KERNEL_FLAGS}") > set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS > "${CYCLES_SSE3_KERNEL_FLAGS}") > set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS > "${CYCLES_SSE41_KERNEL_FLAGS}") > set_source_files_properties(kernel_avx.cpp PROPERTIES COMPILE_FLAGS > "${CYCLES_AVX_KERNEL_FLAGS}") > + set_source_files_properties(kernel_avx2.cpp PROPERTIES COMPILE_FLAGS > "${CYCLES_AVX2_KERNEL_FLAGS}") > endif() > > > diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h > index c4a0864..902b335 100644 > --- a/intern/cycles/kernel/kernel.h > +++ b/intern/cycles/kernel/kernel.h > @@ -87,6 +87,17 @@ void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 > *input, float4 *output, > int type, int i); > #endif > > +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 > +void kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned > int *rng_state, > + int sample, int x, int y, int offset, int stride); > +void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float > *buffer, > + float sample_scale, int x, int y, int offset, int stride); 
> +void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, > float *buffer, > + float sample_scale, int x, int y, int offset, int stride); > +void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output, > + int type, int i); > +#endif > + > CCL_NAMESPACE_END > > #endif /* __KERNEL_H__ */ > diff --git a/intern/cycles/kernel/kernel_avx2.cpp > b/intern/cycles/kernel/kernel_avx2.cpp > new file mode 100644 > index 0000000..2cbad99 > --- /dev/null > +++ b/intern/cycles/kernel/kernel_avx2.cpp > @@ -0,0 +1,82 @@ > +/* > + * Copyright 2011-2014 Blender Foundation > + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > + * See the License for the specific language governing permissions and > + * limitations under the License > + */ > + > +/* Optimized CPU kernel entry points. This file is compiled with AVX2 > + * optimization flags and nearly all functions inlined, while kernel.cpp > + * is compiled without for other CPU's. */ > + > +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ > +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) > +#define __KERNEL_SSE2__ > +#define __KERNEL_SSE3__ > +#define __KERNEL_SSSE3__ > +#define __KERNEL_SSE41__ > +#endif > + > +#include "util_optimization.h" > + > +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 > + > +#include "kernel.h" > +#include "kernel_compat_cpu.h" > +#include "kernel_m > > @@ Diff output truncated at 10240 characters. 
@@ > > _______________________________________________ > Bf-blender-cvs mailing list > [email protected] > http://lists.blender.org/mailman/listinfo/bf-blender-cvs > _______________________________________________ > Bf-committers mailing list > [email protected] > http://lists.blender.org/mailman/listinfo/bf-committers _______________________________________________ Bf-committers mailing list [email protected] http://lists.blender.org/mailman/listinfo/bf-committers
