This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 3a9d9bc716 GH-49410: [C++] Fix if_else null-scalar fast paths for 
sliced BaseBinary arrays (#49443)
3a9d9bc716 is described below

commit 3a9d9bc716afce97a4b7c802e2f5892cdd3bda72
Author: Ebraam <[email protected]>
AuthorDate: Thu Mar 12 13:41:48 2026 +0200

    GH-49410: [C++] Fix if_else null-scalar fast paths for sliced BaseBinary 
arrays (#49443)
    
    ### Rationale for this change
    `if_else` with a null scalar and a sliced `BaseBinary` array (`offsets[0] 
!= 0`) produces invalid output. The ASA and AAS shortcut paths in 
`scalar_if_else.cc` copy offsets without adjusting for the slice offset, and 
copy data from byte 0 instead of `data + offsets[0]`.
    
    ### What changes are included in this PR?
    
    **Fix** (`scalar_if_else.cc`): In both the ASA and AAS fast paths:
    - Replace the single `memcpy` for the offsets buffer with a loop that 
normalizes each offset by subtracting `offsets[0]` as a base, so output offsets 
always start at 0.
    - Change the data copy to start from `data + base` instead of `data`, so 
only the bytes belonging to the slice are copied.
    
    **Regression tests (`scalar_if_else_test.cc`):**
    - `IfElseBaseBinarySliced`: covers sliced arrays (`array.offset != 0`) for 
both ASA and AAS paths.
    - `IfElseBaseBinaryNonZeroFirst`: covers the edge case where `array.offset 
== 0` but `offsets[0] != 0`, manually constructed via `ArrayData::Make()`.
    
    ### Are these changes tested?
    
    Yes. The new `IfElseBaseBinarySliced` typed test reproduces the bug and 
passes after the fix. All 416 existing tests continue to pass.
    
    ### Are there any user-facing changes?
    
    Yes. The bug causes incorrect/invalid data to be produced when `if_else` is 
called with a null scalar and a sliced `BaseBinary` array.
    
    * GitHub Issue: #49410
    
    Lead-authored-by: Ebraam <[email protected]>
    Co-authored-by: Ebraam <[email protected]>
    Co-authored-by: Ibraam-Ashraf 
<[email protected]>
    Signed-off-by: Sutou Kouhei <[email protected]>
---
 cpp/src/arrow/compute/kernels/scalar_if_else.cc    | 30 ++++++++++--
 .../arrow/compute/kernels/scalar_if_else_test.cc   | 53 ++++++++++++++++++++++
 2 files changed, 79 insertions(+), 4 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc 
b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
index ed32ec203c..0c5cfe1c90 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
@@ -759,11 +759,22 @@ struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
       auto* out_data = out->array_data().get();
       auto offset_length = (cond.length + 1) * sizeof(OffsetType);
       ARROW_ASSIGN_OR_RAISE(out_data->buffers[1], 
ctx->Allocate(offset_length));
-      std::memcpy(out_data->buffers[1]->mutable_data(), right_offsets, 
offset_length);
+
+      if (right_offsets[0] == 0) {
+        std::memcpy(out_data->buffers[1]->mutable_data(), right_offsets, 
offset_length);
+      } else {
+        OffsetType base = right_offsets[0];
+        auto* out_offsets =
+            
reinterpret_cast<OffsetType*>(out_data->buffers[1]->mutable_data());
+        for (int64_t i = 0; i <= cond.length; ++i) {
+          out_offsets[i] = right_offsets[i] - base;
+        }
+      }
 
       auto right_data_length = right_offsets[right.length] - right_offsets[0];
       ARROW_ASSIGN_OR_RAISE(out_data->buffers[2], 
ctx->Allocate(right_data_length));
-      std::memcpy(out_data->buffers[2]->mutable_data(), right_data, 
right_data_length);
+      std::memcpy(out_data->buffers[2]->mutable_data(), right_data + 
right_offsets[0],
+                  right_data_length);
       return Status::OK();
     }
 
@@ -801,11 +812,22 @@ struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
       auto* out_data = out->array_data().get();
       auto offset_length = (cond.length + 1) * sizeof(OffsetType);
       ARROW_ASSIGN_OR_RAISE(out_data->buffers[1], 
ctx->Allocate(offset_length));
-      std::memcpy(out_data->buffers[1]->mutable_data(), left_offsets, 
offset_length);
+
+      if (left_offsets[0] == 0) {
+        std::memcpy(out_data->buffers[1]->mutable_data(), left_offsets, 
offset_length);
+      } else {
+        OffsetType base = left_offsets[0];
+        auto* out_offsets =
+            
reinterpret_cast<OffsetType*>(out_data->buffers[1]->mutable_data());
+        for (int64_t i = 0; i <= cond.length; ++i) {
+          out_offsets[i] = left_offsets[i] - base;
+        }
+      }
 
       auto left_data_length = left_offsets[left.length] - left_offsets[0];
       ARROW_ASSIGN_OR_RAISE(out_data->buffers[2], 
ctx->Allocate(left_data_length));
-      std::memcpy(out_data->buffers[2]->mutable_data(), left_data, 
left_data_length);
+      std::memcpy(out_data->buffers[2]->mutable_data(), left_data + 
left_offsets[0],
+                  left_data_length);
       return Status::OK();
     }
 
diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc 
b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
index e5cf73742b..6fdcff8d97 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
@@ -27,6 +27,7 @@
 #include "arrow/compute/kernels/test_util_internal.h"
 #include "arrow/compute/registry.h"
 #include "arrow/testing/gtest_util.h"
+#include "arrow/util/bitmap_builders.h"
 #include "arrow/util/checked_cast.h"
 
 namespace arrow {
@@ -609,6 +610,58 @@ TYPED_TEST(TestIfElseBaseBinary, IfElseBaseBinaryRand) {
   CheckIfElseOutput(cond, left, right, expected_data);
 }
 
+TYPED_TEST(TestIfElseBaseBinary, IfElseBaseBinarySliced) {
+  auto type = TypeTraits<TypeParam>::type_singleton();
+
+  auto full_arr = ArrayFromJSON(type, R"(["not used", null, "x", "x"])");
+  auto sliced = full_arr->Slice(1);
+  auto expected = ArrayFromJSON(type, R"([null, "x", "x"])");
+
+  auto cond_asa = ArrayFromJSON(boolean(), "[true, false, false]");
+  ASSERT_OK_AND_ASSIGN(auto result_asa,
+                       CallFunction("if_else", {cond_asa, 
MakeNullScalar(type), sliced}));
+  ASSERT_OK(result_asa.make_array()->ValidateFull());
+  AssertArraysEqual(*expected, *result_asa.make_array(), true);
+
+  auto cond_aas = ArrayFromJSON(boolean(), "[false, true, true]");
+  ASSERT_OK_AND_ASSIGN(auto result_aas,
+                       CallFunction("if_else", {cond_aas, sliced, 
MakeNullScalar(type)}));
+  ASSERT_OK(result_aas.make_array()->ValidateFull());
+  AssertArraysEqual(*expected, *result_aas.make_array(), true);
+}
+
+// array offset=0 but offsets[0] != 0
+TYPED_TEST(TestIfElseBaseBinary, IfElseBaseBinaryNonZeroFirst) {
+  auto type = TypeTraits<TypeParam>::type_singleton();
+  using OffsetType = typename TypeTraits<TypeParam>::OffsetType::c_type;
+
+  std::vector<OffsetType> raw_offsets = {8, 8, 9, 10};
+  std::string raw_data(8, 'p');
+  raw_data += "ab";
+  auto offsets_buf = Buffer::Wrap(raw_offsets.data(), raw_offsets.size());
+  auto data_buf = Buffer::Wrap(raw_data.data(), raw_data.size());
+  auto array_data = ArrayData::Make(type, /*length=*/3, {nullptr, offsets_buf, 
data_buf},
+                                    /*null_count=*/1, /*offset=*/0);
+  std::vector<uint8_t> validity_bytes = {0, 1, 1};
+  ASSERT_OK_AND_ASSIGN(array_data->buffers[0],
+                       internal::BytesToBits(validity_bytes, 
default_memory_pool()));
+  auto arr = MakeArray(array_data);
+  ASSERT_OK(arr->ValidateFull());
+  auto expected = ArrayFromJSON(type, R"([null, "a", "b"])");
+
+  auto cond_asa = ArrayFromJSON(boolean(), "[true, false, false]");
+  ASSERT_OK_AND_ASSIGN(auto result_asa,
+                       CallFunction("if_else", {cond_asa, 
MakeNullScalar(type), arr}));
+  ASSERT_OK(result_asa.make_array()->ValidateFull());
+  AssertArraysEqual(*expected, *result_asa.make_array(), true);
+
+  auto cond_aas = ArrayFromJSON(boolean(), "[false, true, true]");
+  ASSERT_OK_AND_ASSIGN(auto result_aas,
+                       CallFunction("if_else", {cond_aas, arr, 
MakeNullScalar(type)}));
+  ASSERT_OK(result_aas.make_array()->ValidateFull());
+  AssertArraysEqual(*expected, *result_aas.make_array(), true);
+}
+
 Result<std::shared_ptr<Array>> MakeBinaryArrayWithData(
     const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& 
data_buffer) {
   // Make a (large-)binary array with a single item backed by the given data

Reply via email to