This is an automated email from the ASF dual-hosted git repository.
westonpace pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 196fbe5f5e GH-34639: [C++] Support RecordBatch::FromStructArray even
if struct array has nulls/offsets (#34691)
196fbe5f5e is described below
commit 196fbe5f5e0669b1f8bf71005c8a4c6a4e0fb2da
Author: Weston Pace <[email protected]>
AuthorDate: Mon Mar 27 13:42:47 2023 -0700
GH-34639: [C++] Support RecordBatch::FromStructArray even if struct array
has nulls/offsets (#34691)
### Rationale for this change
A struct array can have a validity map and an offset. A record batch
cannot. When converting from a struct array to a record batch we were throwing
an error if a validity map was present and returning the wrong data if an
offset was present.
### What changes are included in this PR?
If a validity map or offset are present then StructArray::Flatten is used
to push the offset and validity map into the struct array. Note: this means
that RecordBatch::FromStructArray will not be zero-copy if it has to push down
a validity map.
### Are these changes tested?
Yes
### Are there any user-facing changes?
Yes, RecordBatch::FromStructArray now takes in a memory pool because it
might have to make allocations when pushing validity bitmaps down.
* Closes: #34639
Authored-by: Weston Pace <[email protected]>
Signed-off-by: Weston Pace <[email protected]>
---
cpp/src/arrow/json/reader.cc | 5 +++--
cpp/src/arrow/record_batch.cc | 14 ++++++++++----
cpp/src/arrow/record_batch.h | 12 +++++++++---
cpp/src/arrow/record_batch_test.cc | 19 ++++++++++++++++++-
4 files changed, 40 insertions(+), 10 deletions(-)
diff --git a/cpp/src/arrow/json/reader.cc b/cpp/src/arrow/json/reader.cc
index dae06d5bf6..fe3c48b1ae 100644
--- a/cpp/src/arrow/json/reader.cc
+++ b/cpp/src/arrow/json/reader.cc
@@ -317,7 +317,8 @@ class DecodingOperator {
std::shared_ptr<ChunkedArray> chunked;
RETURN_NOT_OK(builder->Finish(&chunked));
- ARROW_ASSIGN_OR_RAISE(auto batch,
RecordBatch::FromStructArray(chunked->chunk(0)));
+ ARROW_ASSIGN_OR_RAISE(
+ auto batch, RecordBatch::FromStructArray(chunked->chunk(0),
context_->pool()));
return DecodedBlock{std::move(batch), num_bytes};
}
@@ -552,7 +553,7 @@ Result<std::shared_ptr<RecordBatch>> ParseOne(ParseOptions
options,
std::shared_ptr<ChunkedArray> converted_chunked;
RETURN_NOT_OK(builder->Finish(&converted_chunked));
- return RecordBatch::FromStructArray(converted_chunked->chunk(0));
+ return RecordBatch::FromStructArray(converted_chunked->chunk(0),
context.pool());
}
} // namespace json
diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc
index 28245c8f5d..c24a19e2f4 100644
--- a/cpp/src/arrow/record_batch.cc
+++ b/cpp/src/arrow/record_batch.cc
@@ -198,14 +198,20 @@ Result<std::shared_ptr<RecordBatch>>
RecordBatch::MakeEmpty(
}
Result<std::shared_ptr<RecordBatch>> RecordBatch::FromStructArray(
- const std::shared_ptr<Array>& array) {
+ const std::shared_ptr<Array>& array, MemoryPool* memory_pool) {
if (array->type_id() != Type::STRUCT) {
return Status::TypeError("Cannot construct record batch from array of type
",
*array->type());
}
- if (array->null_count() != 0) {
- return Status::Invalid(
- "Unable to construct record batch from a StructArray with non-zero
nulls.");
+ if (array->null_count() != 0 || array->offset() != 0) {
+ // If the struct array has a validity map or offset we need to push those
into
+ // the child arrays via Flatten since the RecordBatch doesn't have
validity/offset
+ const std::shared_ptr<StructArray>& struct_array =
+ internal::checked_pointer_cast<StructArray>(array);
+ ARROW_ASSIGN_OR_RAISE(std::vector<std::shared_ptr<Array>> fields,
+ struct_array->Flatten(memory_pool));
+ return Make(arrow::schema(array->type()->fields()), array->length(),
+ std::move(fields));
}
return Make(arrow::schema(array->type()->fields()), array->length(),
array->data()->child_data);
diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h
index 8bc7032256..a5121ab397 100644
--- a/cpp/src/arrow/record_batch.h
+++ b/cpp/src/arrow/record_batch.h
@@ -82,10 +82,16 @@ class ARROW_EXPORT RecordBatch {
/// \brief Construct record batch from struct array
///
/// This constructs a record batch using the child arrays of the given
- /// array, which must be a struct array. Note that the struct array's own
- /// null bitmap is not reflected in the resulting record batch.
+ /// array, which must be a struct array.
+ ///
+ /// \param[in] array the source array, must be a StructArray
+ /// \param[in] pool the memory pool to allocate new validity bitmaps
+ ///
+ /// This operation will usually be zero-copy. However, if the struct array
has an
+ /// offset or a validity bitmap then these will need to be pushed into the
child arrays.
+ /// Pushing the offset is zero-copy but pushing the validity bitmap is not.
static Result<std::shared_ptr<RecordBatch>> FromStructArray(
- const std::shared_ptr<Array>& array);
+ const std::shared_ptr<Array>& array, MemoryPool* pool =
default_memory_pool());
/// \brief Determine if two record batches are exactly equal
///
diff --git a/cpp/src/arrow/record_batch_test.cc
b/cpp/src/arrow/record_batch_test.cc
index ed2f3e552e..9dab74c223 100644
--- a/cpp/src/arrow/record_batch_test.cc
+++ b/cpp/src/arrow/record_batch_test.cc
@@ -331,6 +331,19 @@ TEST_F(TestRecordBatch, ToFromEmptyStructArray) {
ASSERT_TRUE(batch1->Equals(*batch2));
}
+TEST_F(TestRecordBatch, FromSlicedStructArray) {
+ static constexpr int64_t kLength = 10;
+ std::shared_ptr<Array> x_arr = ArrayFromJSON(int64(), "[0, 1, 2, 3, 4, 5, 6,
7, 8, 9]");
+ StructArray struct_array(struct_({field("x", int64())}), kLength, {x_arr});
+ std::shared_ptr<Array> sliced = struct_array.Slice(5, 3);
+ ASSERT_OK_AND_ASSIGN(auto batch, RecordBatch::FromStructArray(sliced));
+
+ std::shared_ptr<Array> expected_arr = ArrayFromJSON(int64(), "[5, 6, 7]");
+ std::shared_ptr<RecordBatch> expected =
+ RecordBatch::Make(schema({field("x", int64())}), 3, {expected_arr});
+ AssertBatchesEqual(*expected, *batch);
+}
+
TEST_F(TestRecordBatch, FromStructArrayInvalidType) {
random::RandomArrayGenerator gen(42);
ASSERT_RAISES(TypeError, RecordBatch::FromStructArray(gen.ArrayOf(int32(),
6)));
@@ -339,7 +352,11 @@ TEST_F(TestRecordBatch, FromStructArrayInvalidType) {
TEST_F(TestRecordBatch, FromStructArrayInvalidNullCount) {
auto struct_array =
ArrayFromJSON(struct_({field("f1", int32())}), R"([{"f1": 1}, null])");
- ASSERT_RAISES(Invalid, RecordBatch::FromStructArray(struct_array));
+ ASSERT_OK_AND_ASSIGN(auto batch, RecordBatch::FromStructArray(struct_array));
+ std::shared_ptr<Array> expected_arr = ArrayFromJSON(int32(), "[1, null]");
+ std::shared_ptr<RecordBatch> expected =
+ RecordBatch::Make(schema({field("f1", int32())}), 2, {expected_arr});
+ AssertBatchesEqual(*expected, *batch);
}
TEST_F(TestRecordBatch, MakeEmpty) {