This is an automated email from the ASF dual-hosted git repository. apitrou pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new 72c7f5d ARROW-2454: [C++] Allow zero-array chunked arrays 72c7f5d is described below commit 72c7f5d3bcacc6d36e309a3731deca48c3c5345d Author: Antoine Pitrou <anto...@python.org> AuthorDate: Mon Apr 16 17:35:20 2018 +0200 ARROW-2454: [C++] Allow zero-array chunked arrays This allows code to be more regular and less fragile. Also fix the chunked array slicing logic. Author: Antoine Pitrou <anto...@python.org> Closes #1897 from pitrou/ARROW-2454-zero-length-chunked-arrays and squashes the following commits: 4ad2c6f <Antoine Pitrou> ARROW-2454: Allow zero-array chunked arrays --- cpp/src/arrow/table-test.cc | 21 ++++++++++++++++++++- cpp/src/arrow/table.cc | 26 +++++++++++++++++++------- cpp/src/arrow/table.h | 4 +++- cpp/src/arrow/type.h | 4 ++-- python/pyarrow/tests/test_table.py | 4 ++++ 5 files changed, 48 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc index b1cf6e5..0b9f75d 100644 --- a/cpp/src/arrow/table-test.cc +++ b/cpp/src/arrow/table-test.cc @@ -43,7 +43,9 @@ class TestChunkedArray : public TestBase { protected: virtual void Construct() { one_ = std::make_shared<ChunkedArray>(arrays_one_); - another_ = std::make_shared<ChunkedArray>(arrays_another_); + if (!arrays_another_.empty()) { + another_ = std::make_shared<ChunkedArray>(arrays_another_); + } } ArrayVector arrays_one_; @@ -121,6 +123,23 @@ TEST_F(TestChunkedArray, SliceEquals) { std::shared_ptr<ChunkedArray> slice2 = one_->Slice(75)->Slice(25)->Slice(25, 50); ASSERT_EQ(slice2->length(), 50); test::AssertChunkedEqual(*slice, *slice2); + + // Making empty slices of a ChunkedArray + std::shared_ptr<ChunkedArray> slice3 = one_->Slice(one_->length(), 99); + ASSERT_EQ(slice3->length(), 0); + ASSERT_EQ(slice3->num_chunks(), 0); + ASSERT_TRUE(slice3->type()->Equals(one_->type())); + + std::shared_ptr<ChunkedArray> slice4 = one_->Slice(10, 0); + ASSERT_EQ(slice4->length(), 0); + ASSERT_EQ(slice4->num_chunks(), 0); + ASSERT_TRUE(slice4->type()->Equals(one_->type())); + + // Slicing an empty ChunkedArray + std::shared_ptr<ChunkedArray> slice5 = slice4->Slice(0, 10); + ASSERT_EQ(slice5->length(), 0); + ASSERT_EQ(slice5->num_chunks(), 0); + ASSERT_TRUE(slice5->type()->Equals(one_->type())); } class TestColumn : public TestChunkedArray { diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index f6ac6dd..8af47ea 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -39,13 +39,25 @@ namespace arrow { ChunkedArray::ChunkedArray(const ArrayVector& chunks) : chunks_(chunks) { length_ = 0; null_count_ = 0; + DCHECK_GT(chunks.size(), 0) + << "cannot construct ChunkedArray from empty vector and omitted type"; + type_ = chunks[0]->type(); for (const std::shared_ptr<Array>& chunk : chunks) { length_ += chunk->length(); null_count_ += chunk->null_count(); } } -std::shared_ptr<DataType> ChunkedArray::type() const { return chunks_[0]->type(); } +ChunkedArray::ChunkedArray(const ArrayVector& chunks, + const std::shared_ptr<DataType>& type) + : chunks_(chunks), type_(type) { + length_ = 0; + null_count_ = 0; + for (const std::shared_ptr<Array>& chunk : chunks) { + length_ += chunk->length(); + null_count_ += chunk->null_count(); + } +} bool ChunkedArray::Equals(const ChunkedArray& other) const { if (length_ != other.length()) { @@ -107,20 +119,20 @@ std::shared_ptr<ChunkedArray> ChunkedArray::Slice(int64_t offset, int64_t length DCHECK_LE(offset, length_); int curr_chunk = 0; - while (offset >= chunk(curr_chunk)->length()) { + while (curr_chunk < num_chunks() && offset >= chunk(curr_chunk)->length()) { offset -= chunk(curr_chunk)->length(); curr_chunk++; } ArrayVector new_chunks; - while (length > 0 && curr_chunk < num_chunks()) { + while (curr_chunk < num_chunks() && length > 0) { new_chunks.push_back(chunk(curr_chunk)->Slice(offset, length)); length -= chunk(curr_chunk)->length() - offset; offset = 0; curr_chunk++; } - return std::make_shared<ChunkedArray>(new_chunks); + return std::make_shared<ChunkedArray>(new_chunks, type_); } std::shared_ptr<ChunkedArray> ChunkedArray::Slice(int64_t offset) const { @@ -129,15 +141,15 @@ std::shared_ptr<ChunkedArray> ChunkedArray::Slice(int64_t offset) const { Column::Column(const std::shared_ptr<Field>& field, const ArrayVector& chunks) : field_(field) { - data_ = std::make_shared<ChunkedArray>(chunks); + data_ = std::make_shared<ChunkedArray>(chunks, field->type()); } Column::Column(const std::shared_ptr<Field>& field, const std::shared_ptr<Array>& data) : field_(field) { if (!data) { - data_ = std::make_shared<ChunkedArray>(ArrayVector({})); + data_ = std::make_shared<ChunkedArray>(ArrayVector({}), field->type()); } else { - data_ = std::make_shared<ChunkedArray>(ArrayVector({data})); + data_ = std::make_shared<ChunkedArray>(ArrayVector({data}), field->type()); } } diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index 20d027d..32af224 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -40,6 +40,7 @@ class Status; class ARROW_EXPORT ChunkedArray { public: explicit ChunkedArray(const ArrayVector& chunks); + ChunkedArray(const ArrayVector& chunks, const std::shared_ptr<DataType>& type); /// \return the total length of the chunked array; computed on construction int64_t length() const { return length_; } @@ -68,7 +69,7 @@ class ARROW_EXPORT ChunkedArray { /// \brief Slice from offset until end of the chunked array std::shared_ptr<ChunkedArray> Slice(int64_t offset) const; - std::shared_ptr<DataType> type() const; + std::shared_ptr<DataType> type() const { return type_; } bool Equals(const ChunkedArray& other) const; bool Equals(const std::shared_ptr<ChunkedArray>& other) const; @@ -77,6 +78,7 @@ class ARROW_EXPORT ChunkedArray { ArrayVector chunks_; int64_t length_; int64_t null_count_; + std::shared_ptr<DataType> type_; private: ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray); diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index ce213b9..e50760b 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -140,8 +140,8 @@ class ARROW_EXPORT DataType { // Return whether the types are equal // - // Types that are logically convertable from one to another e.g. List<UInt8> - // and Binary are NOT equal). + // Types that are logically convertible from one to another (e.g. List<UInt8> + // and Binary) are NOT equal. virtual bool Equals(const DataType& other) const; bool Equals(const std::shared_ptr<DataType>& other) const; diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 8156435..5303cb2 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -44,6 +44,10 @@ def test_chunked_array_getitem(): data_slice = data[4:-1] assert data_slice.to_pylist() == [5] + data_slice = data[99:99] + assert data_slice.type == data.type + assert data_slice.to_pylist() == [] + def test_column_basics(): data = [ -- To stop receiving notification emails like this one, please contact apit...@apache.org.