This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new fb26178332 GH-38074: [C++] Fix Offset Size Calculation for Slicing
Large String and Binary Types in Hash Join (#38147)
fb26178332 is described below
commit fb2617833200d9c90b6d79a85ae9e840b41938da
Author: Hyunseok Seo <[email protected]>
AuthorDate: Mon Oct 16 17:57:09 2023 +0900
GH-38074: [C++] Fix Offset Size Calculation for Slicing Large String and
Binary Types in Hash Join (#38147)
### Rationale for this change
We found that incorrect results in inner joins during hash join operations
were caused by a problem with how large string and binary types were handled.
The `Slice` function was not calculating their sizes correctly.
To fix this, I changed the `Slice` function to calculate the sizes
correctly, based on the type of data for large string and binary.
* Issue raised: #37729
### What changes are included in this PR?
* The `Slice` function has been updated to correctly calculate the offset
for Large String and Large Binary types, and assertion statements have been
added to improve maintainability.
* Unit tests (`TEST(KeyColumnArray, SliceBinaryTest)`) for the Slice
function have been added.
* During random tests for Hash Join (`TEST(HashJoin, Random)`),
modifications were made to allow the creation of Large String as key column
values.
### Are these changes tested?
Yes
### Are there any user-facing changes?
Acero might not have a large user base as it is an experimental feature,
but I deemed the issue of incorrect join results as critical and have addressed
the bug.
* Closes: #38074
Authored-by: Hyunseok Seo <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/acero/hash_join_node_test.cc | 27 ++++++++++++--
cpp/src/arrow/compute/light_array.cc | 12 ++++--
cpp/src/arrow/compute/light_array.h | 25 +++++++++++++
cpp/src/arrow/compute/light_array_test.cc | 60 ++++++++++++++++++++++++++++++
4 files changed, 116 insertions(+), 8 deletions(-)
diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc
b/cpp/src/arrow/acero/hash_join_node_test.cc
index 960d3838fb..58551f4eca 100644
--- a/cpp/src/arrow/acero/hash_join_node_test.cc
+++ b/cpp/src/arrow/acero/hash_join_node_test.cc
@@ -253,7 +253,8 @@ struct RandomDataTypeConstraints {
int max_string_length;
void Default() {
- data_type_enabled_mask = kInt1 | kInt2 | kInt4 | kInt8 | kBool | kBinary |
kString;
+ data_type_enabled_mask =
+ kInt1 | kInt2 | kInt4 | kInt8 | kBool | kBinary | kString |
kLargeString;
min_null_probability = 0.0;
max_null_probability = 0.2;
min_binary_length = 1;
@@ -289,6 +290,7 @@ struct RandomDataTypeConstraints {
static constexpr int64_t kBool = 16;
static constexpr int64_t kBinary = 32;
static constexpr int64_t kString = 64;
+ static constexpr int64_t kLargeString = 128;
};
struct RandomDataType {
@@ -297,6 +299,7 @@ struct RandomDataType {
int fixed_length;
int min_string_length;
int max_string_length;
+ bool is_large_string;
static RandomDataType Random(Random64Bit& rng,
const RandomDataTypeConstraints& constraints) {
@@ -312,6 +315,15 @@ struct RandomDataType {
} else {
result.is_fixed_length = true;
}
+
+ if (!result.is_fixed_length &&
+ (constraints.data_type_enabled_mask & constraints.kLargeString) != 0) {
+ // When selecting the string type, there's a 50% chance of choosing a
large string.
+ result.is_large_string = ((rng.next() % 2) == 0);
+ } else {
+ result.is_large_string = false;
+ }
+
if (constraints.max_null_probability > 0.0) {
// 25% chance of no nulls
// Uniform distribution of null probability from min to max
@@ -405,9 +417,16 @@ std::vector<std::shared_ptr<Array>> GenRandomRecords(
break;
}
} else {
- result.push_back(rag.String(num_rows, data_types[i].min_string_length,
- data_types[i].max_string_length,
- data_types[i].null_probability));
+ if (data_types[i].is_large_string) {
+ // Generate LargeString if is_large_string flag is true
+ result.push_back(rag.LargeString(num_rows,
data_types[i].min_string_length,
+ data_types[i].max_string_length,
+ data_types[i].null_probability));
+ } else {
+ result.push_back(rag.String(num_rows, data_types[i].min_string_length,
+ data_types[i].max_string_length,
+ data_types[i].null_probability));
+ }
}
}
return result;
diff --git a/cpp/src/arrow/compute/light_array.cc
b/cpp/src/arrow/compute/light_array.cc
index 37d4421fd7..4e8b2b2d7c 100644
--- a/cpp/src/arrow/compute/light_array.cc
+++ b/cpp/src/arrow/compute/light_array.cc
@@ -80,8 +80,7 @@ KeyColumnArray KeyColumnArray::Slice(int64_t offset, int64_t
length) const {
KeyColumnArray sliced;
sliced.metadata_ = metadata_;
sliced.length_ = length;
- uint32_t fixed_size =
- !metadata_.is_fixed_length ? sizeof(uint32_t) : metadata_.fixed_length;
+ uint32_t fixed_size = metadata_.fixed_length;
sliced.buffers_[0] =
buffers_[0] ? buffers_[0] + (bit_offset_[0] + offset) / 8 : nullptr;
@@ -89,18 +88,23 @@ KeyColumnArray KeyColumnArray::Slice(int64_t offset,
int64_t length) const {
mutable_buffers_[0] ? mutable_buffers_[0] + (bit_offset_[0] + offset) /
8 : nullptr;
sliced.bit_offset_[0] = (bit_offset_[0] + offset) % 8;
- if (fixed_size == 0 && !metadata_.is_null_type) {
+ if (metadata_.fixed_length == 0 && !metadata_.is_null_type) {
+ ARROW_DCHECK(is_bool_type()) << "Expected BOOL type type but got a
different type.";
sliced.buffers_[1] =
buffers_[1] ? buffers_[1] + (bit_offset_[1] + offset) / 8 : nullptr;
sliced.mutable_buffers_[1] = mutable_buffers_[1]
? mutable_buffers_[1] + (bit_offset_[1] +
offset) / 8
: nullptr;
sliced.bit_offset_[1] = (bit_offset_[1] + offset) % 8;
- } else {
+ } else if (metadata_.fixed_length > 0) {
+ ARROW_DCHECK(is_binary_type() || is_large_binary_type() ||
is_fixed_width_types())
+ << "Expected (LARGE) BINARY or FIXED WIDTH type but got a different
type.";
sliced.buffers_[1] = buffers_[1] ? buffers_[1] + offset * fixed_size :
nullptr;
sliced.mutable_buffers_[1] =
mutable_buffers_[1] ? mutable_buffers_[1] + offset * fixed_size :
nullptr;
sliced.bit_offset_[1] = 0;
+ } else {
+ ARROW_DCHECK(is_null_type()) << "Expected Null type but got a different
type.";
}
sliced.buffers_[2] = buffers_[2];
diff --git a/cpp/src/arrow/compute/light_array.h
b/cpp/src/arrow/compute/light_array.h
index 3dd328c22e..87f6b6c76a 100644
--- a/cpp/src/arrow/compute/light_array.h
+++ b/cpp/src/arrow/compute/light_array.h
@@ -183,6 +183,31 @@ class ARROW_EXPORT KeyColumnArray {
// Starting bit offset within the first byte (between 0 and 7)
// to be used when accessing buffers that store bit vectors.
int bit_offset_[kMaxBuffers - 1];
+
+ bool is_bool_type() const {
+ return metadata_.is_fixed_length && metadata_.fixed_length == 0 &&
+ !metadata_.is_null_type;
+ }
+
+ bool is_fixed_width_types() const {
+ return metadata_.is_fixed_length && metadata_.fixed_length != 0 &&
+ !metadata_.is_null_type;
+ }
+
+ bool is_binary_type() const {
+ return !metadata_.is_fixed_length && metadata_.fixed_length ==
sizeof(uint32_t) &&
+ !metadata_.is_null_type;
+ }
+
+ bool is_large_binary_type() const {
+ return !metadata_.is_fixed_length && metadata_.fixed_length ==
sizeof(uint64_t) &&
+ !metadata_.is_null_type;
+ }
+
+ bool is_null_type() const {
+ return metadata_.is_fixed_length && metadata_.fixed_length == 0 &&
+ metadata_.is_null_type;
+ }
};
/// \brief Create KeyColumnMetadata from a DataType
diff --git a/cpp/src/arrow/compute/light_array_test.cc
b/cpp/src/arrow/compute/light_array_test.cc
index 71d228bf3f..4e33f7b578 100644
--- a/cpp/src/arrow/compute/light_array_test.cc
+++ b/cpp/src/arrow/compute/light_array_test.cc
@@ -226,6 +226,66 @@ TEST(KeyColumnArray, SliceBool) {
}
}
+struct SliceTestCase {
+ int offset;
+ int length;
+ std::vector<std::string> expected;
+};
+
+template <typename OffsetType>
+void GenericTestSlice(const std::shared_ptr<DataType>& type, const char*
json_data,
+ const std::vector<SliceTestCase>& testCases) {
+ auto array = ArrayFromJSON(type, json_data);
+ KeyColumnArray kc_array =
+ ColumnArrayFromArrayData(array->data(), 0, array->length()).ValueOrDie();
+
+ for (const auto& testCase : testCases) {
+ ARROW_SCOPED_TRACE("Offset: ", testCase.offset, " Length: ",
testCase.length);
+ KeyColumnArray sliced = kc_array.Slice(testCase.offset, testCase.length);
+
+ // Extract binary data from the sliced KeyColumnArray
+ std::vector<std::string> sliced_data;
+ const auto* offset_data = reinterpret_cast<const
OffsetType*>(sliced.data(1));
+ const auto* string_data = reinterpret_cast<const char*>(sliced.data(2));
+
+ for (auto i = 0; i < testCase.length; ++i) {
+ auto start = offset_data[i];
+ auto end = offset_data[i + 1];
+ sliced_data.push_back(std::string(string_data + start, string_data +
end));
+ }
+
+ // Compare the sliced values to the expected string
+ ASSERT_EQ(testCase.expected, sliced_data);
+ }
+}
+
+TEST(KeyColumnArray, SliceBinaryTest) {
+ const char* json_test_strings = R"(["Hello", "World", "Slice", "Binary",
"Test"])";
+ std::vector<SliceTestCase> testCases = {
+ {0, 1, {"Hello"}},
+ {1, 1, {"World"}},
+ {2, 1, {"Slice"}},
+ {3, 1, {"Binary"}},
+ {4, 1, {"Test"}},
+ {0, 2, {"Hello", "World"}},
+ {1, 2, {"World", "Slice"}},
+ {2, 2, {"Slice", "Binary"}},
+ {3, 2, {"Binary", "Test"}},
+ {0, 3, {"Hello", "World", "Slice"}},
+ {1, 3, {"World", "Slice", "Binary"}},
+ {2, 3, {"Slice", "Binary", "Test"}},
+ {0, 4, {"Hello", "World", "Slice", "Binary"}},
+ {1, 4, {"World", "Slice", "Binary", "Test"}},
+ {0, 5, {"Hello", "World", "Slice", "Binary", "Test"}},
+ };
+
+ // Run tests with binary type
+ GenericTestSlice<int32_t>(binary(), json_test_strings, testCases);
+
+ // Run tests with large binary type
+ GenericTestSlice<int64_t>(large_binary(), json_test_strings, testCases);
+}
+
TEST(ResizableArrayData, Basic) {
std::unique_ptr<MemoryPool> pool = MemoryPool::CreateDefault();
for (const auto& type : kSampleFixedDataTypes) {