pitrou commented on code in PR #49375:
URL: https://github.com/apache/arrow/pull/49375#discussion_r2847874778
##########
cpp/src/arrow/compute/kernels/scalar_if_else_test.cc:
##########
@@ -608,6 +609,115 @@ TYPED_TEST(TestIfElseBaseBinary, IfElseBaseBinaryRand) {
CheckIfElseOutput(cond, left, right, expected_data);
}
+std::shared_ptr<Array> MakeOversizedBuffer(int32_t data_length) {
Review Comment:
This is creating an array rather than a buffer, perhaps change the name to
reflect that?
##########
cpp/src/arrow/compute/kernels/scalar_if_else_test.cc:
##########
@@ -608,6 +609,115 @@ TYPED_TEST(TestIfElseBaseBinary, IfElseBaseBinaryRand) {
CheckIfElseOutput(cond, left, right, expected_data);
}
+std::shared_ptr<Array> MakeOversizedBuffer(int32_t data_length) {
+ auto offsets = Buffer::FromVector(std::vector<int32_t>{0, data_length});
+ auto values = Buffer::FromString("x");
+ return MakeArray(
+ ArrayData::Make(binary(), 1, {nullptr, std::move(offsets),
std::move(values)}));
+}
+
+Result<std::shared_ptr<Array>> MakeAllocatedVarBinaryArray(
+ const std::shared_ptr<DataType>& type, int64_t data_length) {
+ ARROW_ASSIGN_OR_RAISE(auto values, AllocateBuffer(data_length));
+ if (type->id() == Type::STRING || type->id() == Type::BINARY) {
+ if (data_length > std::numeric_limits<int32_t>::max()) {
+ return Status::Invalid("data_length exceeds int32 offset range");
+ }
+ auto offsets =
+ Buffer::FromVector(std::vector<int32_t>{0,
static_cast<int32_t>(data_length)});
+ return MakeArray(
+ ArrayData::Make(type, 1, {nullptr, std::move(offsets),
std::move(values)}));
+ }
+ if (type->id() != Type::LARGE_STRING && type->id() != Type::LARGE_BINARY) {
+ return Status::TypeError("unsupported var-binary type for helper: ",
*type);
+ }
+ auto offsets = Buffer::FromVector(std::vector<int64_t>{0, data_length});
+ return MakeArray(
+ ArrayData::Make(type, 1, {nullptr, std::move(offsets),
std::move(values)}));
+}
+
+TEST_F(TestIfElseKernel, IfElseStringAAAAllocatedDataCapacityError) {
+ constexpr int64_t kPerSide =
+ static_cast<int64_t>(std::numeric_limits<int32_t>::max() / 2) + 4096;
+
+ auto string_type = TypeTraits<StringType>::type_singleton();
+ auto cond = ArrayFromJSON(boolean(), "[true]");
+ ASSERT_OK_AND_ASSIGN(auto left, MakeAllocatedVarBinaryArray(string_type,
kPerSide));
+ ASSERT_OK_AND_ASSIGN(auto right, MakeAllocatedVarBinaryArray(string_type,
kPerSide));
+
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ CapacityError,
+ ::testing::HasSubstr("Result may exceed offset capacity for this type"),
+ CallFunction("if_else", {cond, left, right}));
+}
+
+TEST_F(TestIfElseKernel, IfElseLargeStringAAADoesNotCapacityOverflow) {
Review Comment:
You can re-use a common helper with the test above, to cut down on code
duplication.
##########
cpp/src/arrow/compute/kernels/scalar_if_else.cc:
##########
@@ -745,18 +760,22 @@ struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
ctx->Allocate(offset_length));
std::memcpy(out_data->buffers[1]->mutable_data(), right_offsets,
offset_length);
- auto right_data_length = right_offsets[right.length] - right_offsets[0];
+ int64_t right_data_length =
static_cast<int64_t>(right_offsets[right.length]) -
+ static_cast<int64_t>(right_offsets[0]);
+ ARROW_RETURN_NOT_OK(ValidateCapacityForOffsetType(right_data_length));
Review Comment:
I don't think this is required? We know `right` already fits in 2GiB, so the
result here cannot possibly overflow.
##########
cpp/src/arrow/compute/kernels/scalar_if_else_test.cc:
##########
@@ -608,6 +609,115 @@ TYPED_TEST(TestIfElseBaseBinary, IfElseBaseBinaryRand) {
CheckIfElseOutput(cond, left, right, expected_data);
}
+std::shared_ptr<Array> MakeOversizedBuffer(int32_t data_length) {
+ auto offsets = Buffer::FromVector(std::vector<int32_t>{0, data_length});
+ auto values = Buffer::FromString("x");
+ return MakeArray(
+ ArrayData::Make(binary(), 1, {nullptr, std::move(offsets),
std::move(values)}));
+}
+
+Result<std::shared_ptr<Array>> MakeAllocatedVarBinaryArray(
+ const std::shared_ptr<DataType>& type, int64_t data_length) {
+ ARROW_ASSIGN_OR_RAISE(auto values, AllocateBuffer(data_length));
+ if (type->id() == Type::STRING || type->id() == Type::BINARY) {
+ if (data_length > std::numeric_limits<int32_t>::max()) {
+ return Status::Invalid("data_length exceeds int32 offset range");
+ }
+ auto offsets =
+ Buffer::FromVector(std::vector<int32_t>{0,
static_cast<int32_t>(data_length)});
+ return MakeArray(
+ ArrayData::Make(type, 1, {nullptr, std::move(offsets),
std::move(values)}));
+ }
+ if (type->id() != Type::LARGE_STRING && type->id() != Type::LARGE_BINARY) {
+ return Status::TypeError("unsupported var-binary type for helper: ",
*type);
+ }
+ auto offsets = Buffer::FromVector(std::vector<int64_t>{0, data_length});
+ return MakeArray(
+ ArrayData::Make(type, 1, {nullptr, std::move(offsets),
std::move(values)}));
+}
+
+TEST_F(TestIfElseKernel, IfElseStringAAAAllocatedDataCapacityError) {
+ constexpr int64_t kPerSide =
+ static_cast<int64_t>(std::numeric_limits<int32_t>::max() / 2) + 4096;
+
+ auto string_type = TypeTraits<StringType>::type_singleton();
+ auto cond = ArrayFromJSON(boolean(), "[true]");
+ ASSERT_OK_AND_ASSIGN(auto left, MakeAllocatedVarBinaryArray(string_type,
kPerSide));
+ ASSERT_OK_AND_ASSIGN(auto right, MakeAllocatedVarBinaryArray(string_type,
kPerSide));
+
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ CapacityError,
+ ::testing::HasSubstr("Result may exceed offset capacity for this type"),
+ CallFunction("if_else", {cond, left, right}));
+}
+
+TEST_F(TestIfElseKernel, IfElseLargeStringAAADoesNotCapacityOverflow) {
+ constexpr int64_t kPerSide =
+ static_cast<int64_t>(std::numeric_limits<int32_t>::max() / 2) + 4096;
+
+ auto large_string_type = TypeTraits<LargeStringType>::type_singleton();
+ auto cond = ArrayFromJSON(boolean(), "[true]");
+ ASSERT_OK_AND_ASSIGN(auto left,
+ MakeAllocatedVarBinaryArray(large_string_type,
kPerSide));
+ ASSERT_OK_AND_ASSIGN(auto right,
+ MakeAllocatedVarBinaryArray(large_string_type,
kPerSide));
+
+ auto maybe_out = CallFunction("if_else", {cond, left, right});
+ ASSERT_FALSE(maybe_out.status().IsCapacityError()) << maybe_out.status();
Review Comment:
Should simply be ok, no?
##########
cpp/src/arrow/compute/kernels/scalar_if_else_test.cc:
##########
@@ -608,6 +609,115 @@ TYPED_TEST(TestIfElseBaseBinary, IfElseBaseBinaryRand) {
CheckIfElseOutput(cond, left, right, expected_data);
}
+std::shared_ptr<Array> MakeOversizedBuffer(int32_t data_length) {
+ auto offsets = Buffer::FromVector(std::vector<int32_t>{0, data_length});
+ auto values = Buffer::FromString("x");
+ return MakeArray(
+ ArrayData::Make(binary(), 1, {nullptr, std::move(offsets),
std::move(values)}));
+}
+
+Result<std::shared_ptr<Array>> MakeAllocatedVarBinaryArray(
+ const std::shared_ptr<DataType>& type, int64_t data_length) {
+ ARROW_ASSIGN_OR_RAISE(auto values, AllocateBuffer(data_length));
+ if (type->id() == Type::STRING || type->id() == Type::BINARY) {
+ if (data_length > std::numeric_limits<int32_t>::max()) {
+ return Status::Invalid("data_length exceeds int32 offset range");
+ }
+ auto offsets =
+ Buffer::FromVector(std::vector<int32_t>{0,
static_cast<int32_t>(data_length)});
+ return MakeArray(
+ ArrayData::Make(type, 1, {nullptr, std::move(offsets),
std::move(values)}));
+ }
+ if (type->id() != Type::LARGE_STRING && type->id() != Type::LARGE_BINARY) {
+ return Status::TypeError("unsupported var-binary type for helper: ",
*type);
+ }
+ auto offsets = Buffer::FromVector(std::vector<int64_t>{0, data_length});
+ return MakeArray(
+ ArrayData::Make(type, 1, {nullptr, std::move(offsets),
std::move(values)}));
+}
+
+TEST_F(TestIfElseKernel, IfElseStringAAAAllocatedDataCapacityError) {
+ constexpr int64_t kPerSide =
+ static_cast<int64_t>(std::numeric_limits<int32_t>::max() / 2) + 4096;
+
+ auto string_type = TypeTraits<StringType>::type_singleton();
+ auto cond = ArrayFromJSON(boolean(), "[true]");
+ ASSERT_OK_AND_ASSIGN(auto left, MakeAllocatedVarBinaryArray(string_type,
kPerSide));
+ ASSERT_OK_AND_ASSIGN(auto right, MakeAllocatedVarBinaryArray(string_type,
kPerSide));
+
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ CapacityError,
+ ::testing::HasSubstr("Result may exceed offset capacity for this type"),
+ CallFunction("if_else", {cond, left, right}));
+}
+
+TEST_F(TestIfElseKernel, IfElseLargeStringAAADoesNotCapacityOverflow) {
+ constexpr int64_t kPerSide =
+ static_cast<int64_t>(std::numeric_limits<int32_t>::max() / 2) + 4096;
+
+ auto large_string_type = TypeTraits<LargeStringType>::type_singleton();
+ auto cond = ArrayFromJSON(boolean(), "[true]");
+ ASSERT_OK_AND_ASSIGN(auto left,
+ MakeAllocatedVarBinaryArray(large_string_type,
kPerSide));
+ ASSERT_OK_AND_ASSIGN(auto right,
+ MakeAllocatedVarBinaryArray(large_string_type,
kPerSide));
+
+ auto maybe_out = CallFunction("if_else", {cond, left, right});
+ ASSERT_FALSE(maybe_out.status().IsCapacityError()) << maybe_out.status();
+}
+
+TEST_F(TestIfElseKernel, IfElseBinaryCapacityErrorAAA) {
+ constexpr int32_t capacity_limit = std::numeric_limits<int32_t>::max();
+
+ auto cond = ArrayFromJSON(boolean(), "[true]");
+ auto left = MakeOversizedBuffer(capacity_limit);
Review Comment:
Why not use `MakeAllocatedVarBinaryArray`?
##########
cpp/src/arrow/compute/kernels/scalar_if_else_test.cc:
##########
@@ -608,6 +609,115 @@ TYPED_TEST(TestIfElseBaseBinary, IfElseBaseBinaryRand) {
CheckIfElseOutput(cond, left, right, expected_data);
}
+std::shared_ptr<Array> MakeOversizedBuffer(int32_t data_length) {
+ auto offsets = Buffer::FromVector(std::vector<int32_t>{0, data_length});
+ auto values = Buffer::FromString("x");
+ return MakeArray(
+ ArrayData::Make(binary(), 1, {nullptr, std::move(offsets),
std::move(values)}));
Review Comment:
Ok, so the array is technically invalid as the data buffer is too short, add
a comment mentioning that?
##########
cpp/src/arrow/compute/kernels/scalar_if_else_test.cc:
##########
@@ -608,6 +609,115 @@ TYPED_TEST(TestIfElseBaseBinary, IfElseBaseBinaryRand) {
CheckIfElseOutput(cond, left, right, expected_data);
}
+std::shared_ptr<Array> MakeOversizedBuffer(int32_t data_length) {
+ auto offsets = Buffer::FromVector(std::vector<int32_t>{0, data_length});
+ auto values = Buffer::FromString("x");
+ return MakeArray(
+ ArrayData::Make(binary(), 1, {nullptr, std::move(offsets),
std::move(values)}));
+}
+
+Result<std::shared_ptr<Array>> MakeAllocatedVarBinaryArray(
+ const std::shared_ptr<DataType>& type, int64_t data_length) {
+ ARROW_ASSIGN_OR_RAISE(auto values, AllocateBuffer(data_length));
+ if (type->id() == Type::STRING || type->id() == Type::BINARY) {
+ if (data_length > std::numeric_limits<int32_t>::max()) {
+ return Status::Invalid("data_length exceeds int32 offset range");
+ }
+ auto offsets =
+ Buffer::FromVector(std::vector<int32_t>{0,
static_cast<int32_t>(data_length)});
+ return MakeArray(
+ ArrayData::Make(type, 1, {nullptr, std::move(offsets),
std::move(values)}));
+ }
+ if (type->id() != Type::LARGE_STRING && type->id() != Type::LARGE_BINARY) {
+ return Status::TypeError("unsupported var-binary type for helper: ",
*type);
+ }
+ auto offsets = Buffer::FromVector(std::vector<int64_t>{0, data_length});
+ return MakeArray(
+ ArrayData::Make(type, 1, {nullptr, std::move(offsets),
std::move(values)}));
+}
+
+TEST_F(TestIfElseKernel, IfElseStringAAAAllocatedDataCapacityError) {
+ constexpr int64_t kPerSide =
+ static_cast<int64_t>(std::numeric_limits<int32_t>::max() / 2) + 4096;
+
+ auto string_type = TypeTraits<StringType>::type_singleton();
+ auto cond = ArrayFromJSON(boolean(), "[true]");
+ ASSERT_OK_AND_ASSIGN(auto left, MakeAllocatedVarBinaryArray(string_type,
kPerSide));
+ ASSERT_OK_AND_ASSIGN(auto right, MakeAllocatedVarBinaryArray(string_type,
kPerSide));
+
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ CapacityError,
+ ::testing::HasSubstr("Result may exceed offset capacity for this type"),
+ CallFunction("if_else", {cond, left, right}));
+}
+
+TEST_F(TestIfElseKernel, IfElseLargeStringAAADoesNotCapacityOverflow) {
+ constexpr int64_t kPerSide =
+ static_cast<int64_t>(std::numeric_limits<int32_t>::max() / 2) + 4096;
+
+ auto large_string_type = TypeTraits<LargeStringType>::type_singleton();
+ auto cond = ArrayFromJSON(boolean(), "[true]");
+ ASSERT_OK_AND_ASSIGN(auto left,
+ MakeAllocatedVarBinaryArray(large_string_type,
kPerSide));
+ ASSERT_OK_AND_ASSIGN(auto right,
+ MakeAllocatedVarBinaryArray(large_string_type,
kPerSide));
+
+ auto maybe_out = CallFunction("if_else", {cond, left, right});
+ ASSERT_FALSE(maybe_out.status().IsCapacityError()) << maybe_out.status();
+}
+
+TEST_F(TestIfElseKernel, IfElseBinaryCapacityErrorAAA) {
Review Comment:
Is this test checking something different than
`IfElseStringAAAAllocatedDataCapacityError`?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]