This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new fb26178332 GH-38074: [C++] Fix Offset Size Calculation for Slicing
Large String and Binary Types in Hash Join (#38147)
fb26178332 is described below
commit fb2617833200d9c90b6d79a85ae9e840b41938da
Author: Hyunseok Seo <[email protected]>
AuthorDate: Mon Oct 16 17:57:09 2023 +0900
GH-38074: [C++] Fix Offset Size Calculation for Slicing Large String and
Binary Types in Hash Join (#38147)
### Rationale for this change
We found that incorrect results in inner joins during hash join operations
were caused by a problem with how large string and binary types were handled.
The `Slice` function was not calculating their sizes correctly.
To fix this, I changed the `Slice` function to calculate the sizes
correctly, based on the type of data for large string and binary.
* Issue raised: #37729
### What changes are included in this PR?
* The `Slice` function has been updated to correctly calculate the offset
for Large String and Large Binary types, and assertion statements have been
added to improve maintainability.
* Unit tests (`TEST(KeyColumnArray, SliceBinaryTest)`) for the Slice
function have been added.
* During random tests for Hash Join (`TEST(HashJoin, Random)`),
modifications were made to allow the creation of Large String as key column
values.
### Are these changes tested?
Yes
### Are there any user-facing changes?
Acero might not have a large user base as it is an experimental feature,
but I deemed the issue of incorrect join results as critical and have addressed
the bug.
* Closes: #38074
Authored-by: Hyunseok Seo <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/acero/hash_join_node_test.cc | 27 ++++++++++++--
cpp/src/arrow/compute/light_array.cc | 12 ++++--
cpp/src/arrow/compute/light_array.h | 25 +++++++++++++
cpp/src/arrow/compute/light_array_test.cc | 60 ++++++++++++++++++++++++++++++
4 files changed, 116 insertions(+), 8 deletions(-)
diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc
b/cpp/src/arrow/acero/hash_join_node_test.cc
index 960d3838fb..58551f4eca 100644
--- a/cpp/src/arrow/acero/hash_join_node_test.cc
+++ b/cpp/src/arrow/acero/hash_join_node_test.cc
@@ -253,7 +253,8 @@ struct RandomDataTypeConstraints {
int max_string_length;
void Default() {
- data_type_enabled_mask = kInt1 | kInt2 | kInt4 | kInt8 | kBool | kBinary |
kString;
+ data_type_enabled_mask =
+ kInt1 | kInt2 | kInt4 | kInt8 | kBool | kBinary | kString |
kLargeString;
min_null_probability = 0.0;
max_null_probability = 0.2;
min_binary_length = 1;
@@ -289,6 +290,7 @@ struct RandomDataTypeConstraints {
static constexpr int64_t kBool = 16;
static constexpr int64_t kBinary = 32;
static constexpr int64_t kString = 64;
+ static constexpr int64_t kLargeString = 128;
};
struct RandomDataType {
@@ -297,6 +299,7 @@ struct RandomDataType {
int fixed_length;
int min_string_length;
int max_string_length;
+ bool is_large_string;
static RandomDataType Random(Random64Bit& rng,
const RandomDataTypeConstraints& constraints) {
@@ -312,6 +315,15 @@ struct RandomDataType {
} else {
result.is_fixed_length = true;
}
+
+ if (!result.is_fixed_length &&
+ (constraints.data_type_enabled_mask & constraints.kLargeString) != 0) {
+ // When selecting the string type, there's a 50% chance of choosing a
large string.
+ result.is_large_string = ((rng.next() % 2) == 0);
+ } else {
+ result.is_large_string = false;
+ }
+
if (constraints.max_null_probability > 0.0) {
// 25% chance of no nulls
// Uniform distribution of null probability from min to max
@@ -405,9 +417,16 @@ std::vector<std::shared_ptr<Array>> GenRandomRecords(
break;
}
} else {
- result.push_back(rag.String(num_rows, data_types[i].min_string_length,
- data_types[i].max_string_length,
- data_types[i].null_probability));
+ if (data_types[i].is_large_string) {
+ // Generate LargeString if is_large_string flag is true
+ result.push_back(rag.LargeString(num_rows,
data_types[i].min_string_length,
+ data_types[i].max_string_length,
+ data_types[i].null_probability));
+ } else {
+ result.push_back(rag.String(num_rows, data_types[i].min_string_length,
+ data_types[i].max_string_length,
+ data_types[i].null_probability));
+ }
}
}
return result;
diff --git a/cpp/src/arrow/compute/light_array.cc
b/cpp/src/arrow/compute/light_array.cc
index 37d4421fd7..4e8b2b2d7c 100644
--- a/cpp/src/arrow/compute/light_array.cc
+++ b/cpp/src/arrow/compute/light_array.cc
@@ -80,8 +80,7 @@ KeyColumnArray KeyColumnArray::Slice(int64_t offset, int64_t
length) const {
KeyColumnArray sliced;
sliced.metadata_ = metadata_;
sliced.length_ = length;
- uint32_t fixed_size =
- !metadata_.is_fixed_length ? sizeof(uint32_t) : metadata_.fixed_length;
+ uint32_t fixed_size = metadata_.fixed_length;
sliced.buffers_[0] =
buffers_[0] ? buffers_[0] + (bit_offset_[0] + offset) / 8 : nullptr;
@@ -89,18 +88,23 @@ KeyColumnArray KeyColumnArray::Slice(int64_t offset,
int64_t length) const {
mutable_buffers_[0] ? mutable_buffers_[0] + (bit_offset_[0] + offset) /
8 : nullptr;
sliced.bit_offset_[0] = (bit_offset_[0] + offset) % 8;
- if (fixed_size == 0 && !metadata_.is_null_type) {
+ if (metadata_.fixed_length == 0 && !metadata_.is_null_type) {
+ ARROW_DCHECK(is_bool_type()) << "Expected BOOL type type but got a
different type.";
sliced.buffers_[1] =
buffers_[1] ? buffers_[1] + (bit_offset_[1] + offset) / 8 : nullptr;
sliced.mutable_buffers_[1] = mutable_buffers_[1]
? mutable_buffers_[1] + (bit_offset_[1] +
offset) / 8
: nullptr;
sliced.bit_offset_[1] = (bit_offset_[1] + offset) % 8;
- } else {
+ } else if (metadata_.fixed_length > 0) {
+ ARROW_DCHECK(is_binary_type() || is_large_binary_type() ||
is_fixed_width_types())
+ << "Expected (LARGE) BINARY or FIXED WIDTH type but got a different
type.";
sliced.buffers_[1] = buffers_[1] ? buffers_[1] + offset * fixed_size :
nullptr;
sliced.mutable_buffers_[1] =
mutable_buffers_[1] ? mutable_buffers_[1] + offset * fixed_size :
nullptr;
sliced.bit_offset_[1] = 0;
+ } else {
+ ARROW_DCHECK(is_null_type()) << "Expected Null type but got a different
type.";
}
sliced.buffers_[2] = buffers_[2];
diff --git a/cpp/src/arrow/compute/light_array.h
b/cpp/src/arrow/compute/light_array.h
index 3dd328c22e..87f6b6c76a 100644
--- a/cpp/src/arrow/compute/light_array.h
+++ b/cpp/src/arrow/compute/light_array.h
@@ -183,6 +183,31 @@ class ARROW_EXPORT KeyColumnArray {
// Starting bit offset within the first byte (between 0 and 7)
// to be used when accessing buffers that store bit vectors.
int bit_offset_[kMaxBuffers - 1];
+
+ bool is_bool_type() const {
+ return metadata_.is_fixed_length && metadata_.fixed_length == 0 &&
+ !metadata_.is_null_type;
+ }
+
+ bool is_fixed_width_types() const {
+ return metadata_.is_fixed_length && metadata_.fixed_length != 0 &&
+ !metadata_.is_null_type;
+ }
+
+ bool is_binary_type() const {
+ return !metadata_.is_fixed_length && metadata_.fixed_length ==
sizeof(uint32_t) &&
+ !metadata_.is_null_type;
+ }
+
+ bool is_large_binary_type() const {
+ return !metadata_.is_fixed_length && metadata_.fixed_length ==
sizeof(uint64_t) &&
+ !metadata_.is_null_type;
+ }
+
+ bool is_null_type() const {
+ return metadata_.is_fixed_length && metadata_.fixed_length == 0 &&
+ metadata_.is_null_type;
+ }
};
/// \brief Create KeyColumnMetadata from a DataType
diff --git a/cpp/src/arrow/compute/light_array_test.cc
b/cpp/src/arrow/compute/light_array_test.cc
index 71d228bf3f..4e33f7b578 100644
--- a/cpp/src/arrow/compute/light_array_test.cc
+++ b/cpp/src/arrow/compute/light_array_test.cc
@@ -226,6 +226,66 @@ TEST(KeyColumnArray, SliceBool) {
}
}
+struct SliceTestCase {
+ int offset;
+ int length;
+ std::vector<std::string> expected;
+};
+
+template <typename OffsetType>
+void GenericTestSlice(const std::shared_ptr<DataType>& type, const char*
json_data,
+ const std::vector<SliceTestCase>& testCases) {
+ auto array = ArrayFromJSON(type, json_data);
+ KeyColumnArray kc_array =
+ ColumnArrayFromArrayData(array->data(), 0, array->length()).ValueOrDie();
+
+ for (const auto& testCase : testCases) {
+ ARROW_SCOPED_TRACE("Offset: ", testCase.offset, " Length: ",
testCase.length);
+ KeyColumnArray sliced = kc_array.Slice(testCase.offset, testCase.length);
+
+ // Extract binary data from the sliced KeyColumnArray
+ std::vector<std::string> sliced_data;
+ const auto* offset_data = reinterpret_cast<const
OffsetType*>(sliced.data(1));
+ const auto* string_data = reinterpret_cast<const char*>(sliced.data(2));
+
+ for (auto i = 0; i < testCase.length; ++i) {
+ auto start = offset_data[i];
+ auto end = offset_data[i + 1];
+ sliced_data.push_back(std::string(string_data + start, string_data +
end));
+ }
+
+ // Compare the sliced values to the expected string
+ ASSERT_EQ(testCase.expected, sliced_data);
+ }
+}
+
+TEST(KeyColumnArray, SliceBinaryTest) {
+ const char* json_test_strings = R"(["Hello", "World", "Slice", "Binary",
"Test"])";
+ std::vector<SliceTestCase> testCases = {
+ {0, 1, {"Hello"}},
+ {1, 1, {"World"}},
+ {2, 1, {"Slice"}},
+ {3, 1, {"Binary"}},
+ {4, 1, {"Test"}},
+ {0, 2, {"Hello", "World"}},
+ {1, 2, {"World", "Slice"}},
+ {2, 2, {"Slice", "Binary"}},
+ {3, 2, {"Binary", "Test"}},
+ {0, 3, {"Hello", "World", "Slice"}},
+ {1, 3, {"World", "Slice", "Binary"}},
+ {2, 3, {"Slice", "Binary", "Test"}},
+ {0, 4, {"Hello", "World", "Slice", "Binary"}},
+ {1, 4, {"World", "Slice", "Binary", "Test"}},
+ {0, 5, {"Hello", "World", "Slice", "Binary", "Test"}},
+ };
+
+ // Run tests with binary type
+ GenericTestSlice<int32_t>(binary(), json_test_strings, testCases);
+
+ // Run tests with large binary type
+ GenericTestSlice<int64_t>(large_binary(), json_test_strings, testCases);
+}
+
TEST(ResizableArrayData, Basic) {
std::unique_ptr<MemoryPool> pool = MemoryPool::CreateDefault();
for (const auto& type : kSampleFixedDataTypes) {