Commit: 891b97302957dc4df3db6182c4121e39a0775e0b
Author: Jacques Lucke
Date:   Sun Jan 8 17:19:57 2023 +0100
Branches: master
https://developer.blender.org/rB891b97302957dc4df3db6182c4121e39a0775e0b

Functions: optimize multi-function evaluation in materialized mode

This allows auto-vectorization to happen when a multi-function is
evaluated in "materialized" mode, i.e. it is processed in chunks where
all input and output values are stored in contiguous arrays.

It also unifies the handling of input, mutable and output parameters a bit.
Now they all can use temporary buffers in the same way.

===================================================================

M       source/blender/functions/FN_multi_function_builder.hh

===================================================================

diff --git a/source/blender/functions/FN_multi_function_builder.hh 
b/source/blender/functions/FN_multi_function_builder.hh
index 04849bcc221..bfe88e01f04 100644
--- a/source/blender/functions/FN_multi_function_builder.hh
+++ b/source/blender/functions/FN_multi_function_builder.hh
@@ -162,35 +162,24 @@ enum class MaterializeArgMode {
 
 template<typename ParamTag> struct MaterializeArgInfo {
   MaterializeArgMode mode = MaterializeArgMode::Unknown;
-  Span<typename ParamTag::base_type> internal_span;
+  const typename ParamTag::base_type *internal_span_data;
 };
 
 /**
- * Similar to #execute_array but accepts two mask inputs, one for inputs and 
one for outputs.
+ * Similar to #execute_array but is only used with arrays and does not need a 
mask.
  */
 template<typename... ParamTags, typename ElementFn, typename... Chunks>
-inline void execute_materialized_impl(TypeSequence<ParamTags...> 
/*param_tags*/,
-                                      const ElementFn element_fn,
-                                      const IndexRange in_mask,
-                                      const IndexMask out_mask,
-                                      Chunks &&__restrict... chunks)
+#if (defined(__GNUC__) && !defined(__clang__))
+[[gnu::optimize("-funroll-loops")]] [[gnu::optimize("O3")]]
+#endif
+inline void
+execute_materialized_impl(TypeSequence<ParamTags...> /*param_tags*/,
+                          const ElementFn element_fn,
+                          const int64_t size,
+                          Chunks &&__restrict... chunks)
 {
-  BLI_assert(in_mask.size() == out_mask.size());
-  for (const int64_t i : IndexRange(in_mask.size())) {
-    const int64_t in_i = in_mask[i];
-    const int64_t out_i = out_mask[i];
-    element_fn([&]() -> decltype(auto) {
-      using ParamTag = ParamTags;
-      if constexpr (ParamTag::category == ParamCategory::SingleInput) {
-        return chunks[in_i];
-      }
-      else if constexpr (ParamTag::category == ParamCategory::SingleOutput) {
-        return chunks[out_i];
-      }
-      else if constexpr (ParamTag::category == ParamCategory::SingleMutable) {
-        return chunks[out_i];
-      }
-    }()...);
+  for (int64_t i = 0; i < size; i++) {
+    element_fn(chunks[i]...);
   }
 }
 
@@ -211,15 +200,12 @@ inline void 
execute_materialized(TypeSequence<ParamTags...> /* param_tags */,
    * that large temporary arrays are needed. Using small chunks allows using 
small arrays, which
    * are reused multiple times, which improves cache efficiency. The chunk 
size also shouldn't be
    * too small, because then overhead of the outer loop over chunks becomes 
significant again. */
-  static constexpr int64_t MaxChunkSize = 32;
+  static constexpr int64_t MaxChunkSize = 64;
   const int64_t mask_size = mask.size();
-  const int64_t buffer_size = std::min(mask_size, MaxChunkSize);
-
-  /* Local buffers that are used to temporarily store values retrieved from 
virtual arrays. */
-  std::tuple<TypedBuffer<typename ParamTags::base_type, MaxChunkSize>...> 
buffers_owner;
+  const int64_t tmp_buffer_size = std::min(mask_size, MaxChunkSize);
 
-  /* A span for each parameter which is either empty or points to memory in 
#buffers_owner. */
-  std::tuple<MutableSpan<typename ParamTags::base_type>...> buffers;
+  /* Local buffers that are used to temporarily store values for processing. */
+  std::tuple<TypedBuffer<typename ParamTags::base_type, MaxChunkSize>...> 
temporary_buffers;
 
   /* Information about every parameter. */
   std::tuple<MaterializeArgInfo<ParamTags>...> args_info;
@@ -237,16 +223,17 @@ inline void 
execute_materialized(TypeSequence<ParamTags...> /* param_tags */,
           if (common_info.type == CommonVArrayInfo::Type::Single) {
             /* If an input #VArray is a single value, we have to fill the 
buffer with that value
              * only once. The same unchanged buffer can then be reused in 
every chunk. */
-            MutableSpan<T> in_chunk{std::get<I>(buffers_owner).ptr(), 
buffer_size};
             const T &in_single = *static_cast<const T *>(common_info.data);
-            uninitialized_fill_n(in_chunk.data(), in_chunk.size(), in_single);
-            std::get<I>(buffers) = in_chunk;
+            T *tmp_buffer = std::get<I>(temporary_buffers).ptr();
+            uninitialized_fill_n(tmp_buffer, tmp_buffer_size, in_single);
             arg_info.mode = MaterializeArgMode::Single;
           }
           else if (common_info.type == CommonVArrayInfo::Type::Span) {
             /* Remember the span so that it doesn't have to be retrieved in 
every iteration. */
-            const T *ptr = static_cast<const T *>(common_info.data);
-            arg_info.internal_span = Span<T>(ptr, varray_impl.size());
+            arg_info.internal_span_data = static_cast<const T 
*>(common_info.data);
+          }
+          else {
+            arg_info.internal_span_data = nullptr;
           }
         }
       }(),
@@ -254,56 +241,98 @@ inline void 
execute_materialized(TypeSequence<ParamTags...> /* param_tags */,
 
   /* Outer loop over all chunks. */
   for (int64_t chunk_start = 0; chunk_start < mask_size; chunk_start += 
MaxChunkSize) {
-    const IndexMask sliced_mask = mask.slice_safe(chunk_start, MaxChunkSize);
-    const int64_t chunk_size = sliced_mask.size();
+    const int64_t chunk_end = std::min<int64_t>(chunk_start + MaxChunkSize, 
mask_size);
+    const int64_t chunk_size = chunk_end - chunk_start;
+    const IndexMask sliced_mask = mask.slice(chunk_start, chunk_size);
+    const int64_t mask_start = sliced_mask[0];
     const bool sliced_mask_is_range = sliced_mask.is_range();
 
+    /* Move mutable data into temporary array. */
+    if (!sliced_mask_is_range) {
+      (
+          [&] {
+            /* Use `typedef` instead of `using` to work around a compiler bug. 
*/
+            typedef ParamTags ParamTag;
+            typedef typename ParamTag::base_type T;
+            if constexpr (ParamTag::category == ParamCategory::SingleMutable) {
+              T *tmp_buffer = std::get<I>(temporary_buffers).ptr();
+              T *param_buffer = std::get<I>(loaded_params);
+              for (int64_t i = 0; i < chunk_size; i++) {
+                new (tmp_buffer + i) 
T(std::move(param_buffer[sliced_mask[i]]));
+              }
+            }
+          }(),
+          ...);
+    }
+
     execute_materialized_impl(
         TypeSequence<ParamTags...>(),
         element_fn,
-        /* Inputs are "compressed" into contiguous arrays without gaps. */
-        IndexRange(chunk_size),
-        /* Outputs are written directly into the correct place in the output 
arrays. */
-        sliced_mask,
+        chunk_size,
         /* Prepare every parameter for this chunk. */
         [&] {
           using ParamTag = ParamTags;
           using T = typename ParamTag::base_type;
           [[maybe_unused]] MaterializeArgInfo<ParamTags> &arg_info = 
std::get<I>(args_info);
+          T *tmp_buffer = std::get<I>(temporary_buffers);
           if constexpr (ParamTag::category == ParamCategory::SingleInput) {
             if (arg_info.mode == MaterializeArgMode::Single) {
               /* The single value has been filled into a buffer already reused 
for every chunk. */
-              return Span<T>(std::get<I>(buffers));
+              return const_cast<const T *>(tmp_buffer);
             }
-            else {
-              if (sliced_mask_is_range) {
-                if (!arg_info.internal_span.is_empty()) {
-                  /* In this case we can just use an existing span instead of 
"compressing" it into
-                   * a new temporary buffer. */
-                  const IndexRange sliced_mask_range = sliced_mask.as_range();
-                  arg_info.mode = MaterializeArgMode::Span;
-                  return arg_info.internal_span.slice(sliced_mask_range);
-                }
-              }
-              const GVArrayImpl &varray_impl = *std::get<I>(loaded_params);
-              /* As a fallback, do a virtual function call to retrieve all 
elements in the current
-               * chunk. The elements are stored in a temporary buffer reused 
for every chunk. */
-              MutableSpan<T> in_chunk{std::get<I>(buffers_owner).ptr(), 
chunk_size};
-              varray_impl.materialize_compressed_to_uninitialized(sliced_mask, 
in_chunk.data());
-              /* Remember that this parameter has been materialized, so that 
the values are
-               * destructed properly when the chunk is done. */
-              arg_info.mode = MaterializeArgMode::Materialized;
-              return Span<T>(in_chunk);
+            if (sliced_mask_is_range && arg_info.internal_span_data != 
nullptr) {
+              /* In this case we can just use an existing span instead of 
"compressing" it into
+               * a new temporary buffer. */
+              arg_info.mode = MaterializeArgMode::Span;
+              return arg_info.internal_span_data + mask_start;
             }
+            const GVArrayImpl &varray_impl = *std::get<I>(loaded_params);
+            /* As a fallback, do a virtual function call to retrieve all 
elements in the current
+             * chunk. The elements are stored in a temporary buffer reused for 
every chunk. */
+            varray_impl.materialize_compressed_to_uninitialized(sliced_mask, 
tmp_buffer);
+            /* Remember that this parameter has been materialized, so that the 
values are
+             * destructed properly when the chunk is done. */
+            arg_info.mode = MaterializeArgMode::Materialized;
+            return const_cast<const T *>(tmp_buffer);
           }
           else if constexpr (ELEM(ParamTag::category,
                                   ParamCategory::SingleOutput,
                                   ParamCategory::SingleMutable)) {
             /* For outputs, just pass a pointer. This is important so that 
`__restrict` works. */
-            return std::get<I>(loaded_params);
+            if (sliced_mask_is_range) {
+              /* Can write into the caller-provided buffer directly. */
+              T *param_buffer = std::get<I>(loaded_params);
+              return param_buffer + mask_start;
+            }
+            else {
+              /* Use the temporary buffer. The va

@@ Diff output truncated at 10240 characters. @@

_______________________________________________
Bf-blender-cvs mailing list
[email protected]
List details, subscription details or unsubscribe:
https://lists.blender.org/mailman/listinfo/bf-blender-cvs

Reply via email to