Repository: arrow Updated Branches: refs/heads/master 0ab425245 -> 42b55d98c
ARROW-544: [C++] Test writing zero-length record batches, zero-length BinaryArray fixes I believe this should fix the failure reported in the Spark integration work. We'll need to upgrade the conda test packages to verify. cc @BryanCutler Author: Wes McKinney <[email protected]> Closes #333 from wesm/ARROW-544 and squashes the following commits: f80d58f [Wes McKinney] Protect zero-length record batches from incomplete buffer metadata f876dce [Wes McKinney] Test with null value_offsets too 1dc7733 [Wes McKinney] Test writing zero-length record batches, misc zero-length fixes Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/42b55d98 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/42b55d98 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/42b55d98 Branch: refs/heads/master Commit: 42b55d98c151d87e5a7c6442800f3e5b9316499b Parents: 0ab4252 Author: Wes McKinney <[email protected]> Authored: Fri Feb 10 13:15:39 2017 -0500 Committer: Wes McKinney <[email protected]> Committed: Fri Feb 10 13:15:39 2017 -0500 ---------------------------------------------------------------------- cpp/src/arrow/array-string-test.cc | 4 ++ cpp/src/arrow/array.cc | 11 +++- cpp/src/arrow/ipc/adapter.cc | 27 ++++++--- cpp/src/arrow/ipc/ipc-adapter-test.cc | 89 +++++++++++++++++++++--------- 4 files changed, 94 insertions(+), 37 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/42b55d98/cpp/src/arrow/array-string-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-string-test.cc b/cpp/src/arrow/array-string-test.cc index c4d9bf4..d8a3585 100644 --- a/cpp/src/arrow/array-string-test.cc +++ b/cpp/src/arrow/array-string-test.cc @@ -470,4 +470,8 @@ TEST_F(TestStringArray, TestSliceEquality) { CheckSliceEquality<BinaryType>(); } +TEST_F(TestBinaryArray, LengthZeroCtor) { + BinaryArray array(0, nullptr, nullptr); +} + } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/42b55d98/cpp/src/arrow/array.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index bf368d9..81678e3 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -270,9 +270,12 @@ BinaryArray::BinaryArray(const std::shared_ptr<DataType>& type, int32_t length, const std::shared_ptr<Buffer>& null_bitmap, int32_t null_count, int32_t offset) : Array(type, length, null_bitmap, null_count, offset), value_offsets_(value_offsets), - raw_value_offsets_(reinterpret_cast<const int32_t*>(value_offsets_->data())), + raw_value_offsets_(nullptr), data_(data), raw_data_(nullptr) { + if (value_offsets_ != nullptr) { + raw_value_offsets_ = reinterpret_cast<const int32_t*>(value_offsets_->data()); + } if (data_ != nullptr) { raw_data_ = data_->data(); } } @@ -384,8 +387,10 @@ UnionArray::UnionArray(const std::shared_ptr<DataType>& type, int32_t length, : Array(type, length, null_bitmap, null_count, offset), children_(children), type_ids_(type_ids), - value_offsets_(value_offsets) { - raw_type_ids_ = reinterpret_cast<const uint8_t*>(type_ids->data()); + raw_type_ids_(nullptr), + value_offsets_(value_offsets), + raw_value_offsets_(nullptr) { + if (type_ids) { raw_type_ids_ = reinterpret_cast<const uint8_t*>(type_ids->data()); } if (value_offsets) { raw_value_offsets_ = reinterpret_cast<const int32_t*>(value_offsets->data()); } http://git-wip-us.apache.org/repos/asf/arrow/blob/42b55d98/cpp/src/arrow/ipc/adapter.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index 3613ccb..f36ff37 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -602,8 +602,13 @@ class ArrayLoader : public TypeVisitor { std::shared_ptr<Buffer> offsets; std::shared_ptr<Buffer> values; - RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &offsets)); - RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &values)); + if (field_meta.length > 0) { + RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &offsets)); + RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &values)); + } else { + context_->buffer_index += 2; + offsets = values = nullptr; + } result_ = std::make_shared<CONTAINER>( field_meta.length, offsets, values, null_bitmap, field_meta.null_count); @@ -661,7 +666,12 @@ class ArrayLoader : public TypeVisitor { RETURN_NOT_OK(LoadCommon(&field_meta, &null_bitmap)); std::shared_ptr<Buffer> offsets; - RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &offsets)); + if (field_meta.length > 0) { + RETURN_NOT_OK(GetBuffer(context_->buffer_index, &offsets)); + } else { + offsets = nullptr; + } + ++context_->buffer_index; const int num_children = type.num_children(); if (num_children != 1) { @@ -708,13 +718,16 @@ class ArrayLoader : public TypeVisitor { std::shared_ptr<Buffer> null_bitmap; RETURN_NOT_OK(LoadCommon(&field_meta, &null_bitmap)); - std::shared_ptr<Buffer> type_ids; + std::shared_ptr<Buffer> type_ids = nullptr; std::shared_ptr<Buffer> offsets = nullptr; - RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &type_ids)); - if (type.mode == UnionMode::DENSE) { - RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &offsets)); + if (field_meta.length > 0) { + RETURN_NOT_OK(GetBuffer(context_->buffer_index, &type_ids)); + if (type.mode == UnionMode::DENSE) { + RETURN_NOT_OK(GetBuffer(context_->buffer_index + 1, &offsets)); + } } + context_->buffer_index += type.mode == UnionMode::DENSE? 2 : 1; std::vector<std::shared_ptr<Array>> fields; RETURN_NOT_OK(LoadChildren(type.children(), &fields)); http://git-wip-us.apache.org/repos/asf/arrow/blob/42b55d98/cpp/src/arrow/ipc/ipc-adapter-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc index bae6578..d11b95b 100644 --- a/cpp/src/arrow/ipc/ipc-adapter-test.cc +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -71,6 +71,42 @@ class TestWriteRecordBatch : public ::testing::TestWithParam<MakeRecordBatch*>, return ReadRecordBatch(metadata, batch.schema(), &buffer_reader, batch_result); } + void CheckRoundtrip(const RecordBatch& batch, int64_t buffer_size) { + std::shared_ptr<RecordBatch> batch_result; + + ASSERT_OK(RoundTripHelper(batch, 1 << 16, &batch_result)); + EXPECT_EQ(batch.num_rows(), batch_result->num_rows()); + + ASSERT_TRUE(batch.schema()->Equals(batch_result->schema())); + ASSERT_EQ(batch.num_columns(), batch_result->num_columns()) + << batch.schema()->ToString() + << " result: " << batch_result->schema()->ToString(); + + for (int i = 0; i < batch.num_columns(); ++i) { + const auto& left = *batch.column(i); + const auto& right = *batch_result->column(i); + if (!left.Equals(right)) { + std::stringstream pp_result; + std::stringstream pp_expected; + + ASSERT_OK(PrettyPrint(left, 0, &pp_expected)); + ASSERT_OK(PrettyPrint(right, 0, &pp_result)); + + FAIL() << "Index: " << i << " Expected: " << pp_expected.str() + << "\nGot: " << pp_result.str(); + } + } + } + + void CheckRoundtrip(const std::shared_ptr<Array>& array, int64_t buffer_size) { + auto f0 = arrow::field("f0", array->type()); + std::vector<std::shared_ptr<Field>> fields = {f0}; + auto schema = std::make_shared<Schema>(fields); + + RecordBatch batch(schema, 0, {array}); + CheckRoundtrip(batch, buffer_size); + } + protected: std::shared_ptr<io::MemoryMappedFile> mmap_; MemoryPool* pool_; @@ -79,48 +115,47 @@ class TestWriteRecordBatch : public ::testing::TestWithParam<MakeRecordBatch*>, TEST_P(TestWriteRecordBatch, RoundTrip) { std::shared_ptr<RecordBatch> batch; ASSERT_OK((*GetParam())(&batch)); // NOLINT clang-tidy gtest issue - std::shared_ptr<RecordBatch> batch_result; - ASSERT_OK(RoundTripHelper(*batch, 1 << 16, &batch_result)); - - // do checks - ASSERT_TRUE(batch->schema()->Equals(batch_result->schema())); - ASSERT_EQ(batch->num_columns(), batch_result->num_columns()) - << batch->schema()->ToString() << " result: " << batch_result->schema()->ToString(); - EXPECT_EQ(batch->num_rows(), batch_result->num_rows()); - for (int i = 0; i < batch->num_columns(); ++i) { - EXPECT_TRUE(batch->column(i)->Equals(batch_result->column(i))) - << "Idx: " << i << " Name: " << batch->column_name(i); - } + + CheckRoundtrip(*batch, 1 << 20); } TEST_P(TestWriteRecordBatch, SliceRoundTrip) { std::shared_ptr<RecordBatch> batch; ASSERT_OK((*GetParam())(&batch)); // NOLINT clang-tidy gtest issue - std::shared_ptr<RecordBatch> batch_result; // Skip the zero-length case if (batch->num_rows() < 2) { return; } auto sliced_batch = batch->Slice(2, 10); + CheckRoundtrip(*sliced_batch, 1 << 20); +} + +TEST_P(TestWriteRecordBatch, ZeroLengthArrays) { + std::shared_ptr<RecordBatch> batch; + ASSERT_OK((*GetParam())(&batch)); // NOLINT clang-tidy gtest issue - ASSERT_OK(RoundTripHelper(*sliced_batch, 1 << 16, &batch_result)); + std::shared_ptr<RecordBatch> zero_length_batch; + if (batch->num_rows() > 2) { + zero_length_batch = batch->Slice(2, 0); + } else { + zero_length_batch = batch->Slice(0, 0); + } - EXPECT_EQ(sliced_batch->num_rows(), batch_result->num_rows()); + CheckRoundtrip(*zero_length_batch, 1 << 20); - for (int i = 0; i < sliced_batch->num_columns(); ++i) { - const auto& left = *sliced_batch->column(i); - const auto& right = *batch_result->column(i); - if (!left.Equals(right)) { - std::stringstream pp_result; - std::stringstream pp_expected; + // ARROW-544: check binary array + std::shared_ptr<MutableBuffer> value_offsets; + ASSERT_OK(AllocateBuffer(pool_, sizeof(int32_t), &value_offsets)); + *reinterpret_cast<int32_t*>(value_offsets->mutable_data()) = 0; - ASSERT_OK(PrettyPrint(left, 0, &pp_expected)); - ASSERT_OK(PrettyPrint(right, 0, &pp_result)); + std::shared_ptr<Array> bin_array = std::make_shared<BinaryArray>(0, value_offsets, + std::make_shared<Buffer>(nullptr, 0), std::make_shared<Buffer>(nullptr, 0)); - FAIL() << "Index: " << i << " Expected: " << pp_expected.str() - << "\nGot: " << pp_result.str(); - } - } + // null value_offsets + std::shared_ptr<Array> bin_array2 = std::make_shared<BinaryArray>(0, nullptr, nullptr); + + CheckRoundtrip(bin_array, 1 << 20); + CheckRoundtrip(bin_array2, 1 << 20); } INSTANTIATE_TEST_CASE_P(
