Repository: arrow Updated Branches: refs/heads/master 0d1e69c23 -> 434df8af0
ARROW-1488: [C++] Implement ArrayBuilder::Finish in terms of FinishInternal based on ArrayData I don't have strong feelings about the function names here, but this was backwards compatible and will prevent extra box-then-unbox steps in analytics code that utilizes builders for accumulating outputs Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #1191 from wesm/ARROW-1488 and squashes the following commits: 0c23ff53 [Wes McKinney] Rename Finish with ArrayData to FinishInternal 5313cab2 [Wes McKinney] Initial hack at refactoring Finish methods to return ArrayData Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/434df8af Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/434df8af Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/434df8af Branch: refs/heads/master Commit: 434df8af0527604303f98a23d826dad0ccc7ce12 Parents: 0d1e69c Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Thu Oct 12 12:15:44 2017 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Thu Oct 12 12:15:44 2017 -0400 ---------------------------------------------------------------------- cpp/src/arrow/builder.cc | 126 +++++++++++++++++++++--------------------- cpp/src/arrow/builder.h | 40 ++++++++------ 2 files changed, 85 insertions(+), 81 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/434df8af/cpp/src/arrow/builder.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 55e7873..7152c7a 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -99,6 +99,12 @@ Status ArrayBuilder::Advance(int64_t elements) { return Status::OK(); } +Status ArrayBuilder::Finish(std::shared_ptr<Array>* out) { + std::shared_ptr<ArrayData> internal_data; + RETURN_NOT_OK(FinishInternal(&internal_data)); + return MakeArray(internal_data, out); +} + Status ArrayBuilder::Reserve(int64_t elements) { if (length_ + elements > capacity_) { // TODO(emkornfield) power of 2 growth is potentially suboptimal @@ -213,8 +219,9 @@ void ArrayBuilder::UnsafeSetNotNull(int64_t length) { // ---------------------------------------------------------------------- // Null builder -Status NullBuilder::Finish(std::shared_ptr<Array>* out) { - *out = std::make_shared<NullArray>(length_); +Status NullBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) { + BufferVector buffers = {nullptr}; + *out = std::make_shared<ArrayData>(null(), length_, std::move(buffers), length_); length_ = null_count_ = 0; return Status::OK(); } @@ -302,14 +309,14 @@ Status PrimitiveBuilder<T>::Append(const std::vector<value_type>& values) { } template <typename T> -Status PrimitiveBuilder<T>::Finish(std::shared_ptr<Array>* out) { +Status PrimitiveBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) { const int64_t bytes_required = TypeTraits<T>::bytes_required(length_); if (bytes_required > 0 && bytes_required < data_->size()) { // Trim buffers RETURN_NOT_OK(data_->Resize(bytes_required)); } - *out = std::make_shared<typename TypeTraits<T>::ArrayType>(type_, length_, data_, - null_bitmap_, null_count_); + BufferVector buffers = {null_bitmap_, data_}; + *out = std::make_shared<ArrayData>(type_, length_, std::move(buffers), null_count_); data_ = null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; @@ -372,34 +379,36 @@ Status AdaptiveIntBuilderBase::Resize(int64_t capacity) { AdaptiveIntBuilder::AdaptiveIntBuilder(MemoryPool* pool) : AdaptiveIntBuilderBase(pool) {} -Status AdaptiveIntBuilder::Finish(std::shared_ptr<Array>* out) { +Status AdaptiveIntBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) { const int64_t bytes_required = length_ * int_size_; if (bytes_required > 0 && bytes_required < data_->size()) { // Trim buffers RETURN_NOT_OK(data_->Resize(bytes_required)); } + + std::shared_ptr<DataType> output_type; switch (int_size_) { case 1: - *out = - std::make_shared<Int8Array>(int8(), length_, data_, null_bitmap_, null_count_); + output_type = int8(); break; case 2: - *out = std::make_shared<Int16Array>(int16(), length_, data_, null_bitmap_, - null_count_); + output_type = int16(); break; case 4: - *out = std::make_shared<Int32Array>(int32(), length_, data_, null_bitmap_, - null_count_); + output_type = int32(); break; case 8: - *out = std::make_shared<Int64Array>(int64(), length_, data_, null_bitmap_, - null_count_); + output_type = int64(); break; default: DCHECK(false); return Status::NotImplemented("Only ints of size 1,2,4,8 are supported"); } + BufferVector buffers = {null_bitmap_, data_}; + *out = + std::make_shared<ArrayData>(output_type, length_, std::move(buffers), null_count_); + data_ = null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; return Status::OK(); @@ -529,34 +538,35 @@ Status AdaptiveIntBuilder::ExpandIntSize(uint8_t new_int_size) { AdaptiveUIntBuilder::AdaptiveUIntBuilder(MemoryPool* pool) : AdaptiveIntBuilderBase(pool) {} -Status AdaptiveUIntBuilder::Finish(std::shared_ptr<Array>* out) { +Status AdaptiveUIntBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) { const int64_t bytes_required = length_ * int_size_; if (bytes_required > 0 && bytes_required < data_->size()) { // Trim buffers RETURN_NOT_OK(data_->Resize(bytes_required)); } + std::shared_ptr<DataType> output_type; switch (int_size_) { case 1: - *out = std::make_shared<UInt8Array>(uint8(), length_, data_, null_bitmap_, - null_count_); + output_type = uint8(); break; case 2: - *out = std::make_shared<UInt16Array>(uint16(), length_, data_, null_bitmap_, - null_count_); + output_type = uint16(); break; case 4: - *out = std::make_shared<UInt32Array>(uint32(), length_, data_, null_bitmap_, - null_count_); + output_type = uint32(); break; case 8: - *out = std::make_shared<UInt64Array>(uint64(), length_, data_, null_bitmap_, - null_count_); + output_type = uint64(); break; default: DCHECK(false); return Status::NotImplemented("Only ints of size 1,2,4,8 are supported"); } + BufferVector buffers = {null_bitmap_, data_}; + *out = + std::make_shared<ArrayData>(output_type, length_, std::move(buffers), null_count_); + data_ = null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; return Status::OK(); @@ -725,14 +735,15 @@ Status BooleanBuilder::Resize(int64_t capacity) { return Status::OK(); } -Status BooleanBuilder::Finish(std::shared_ptr<Array>* out) { +Status BooleanBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) { const int64_t bytes_required = BitUtil::BytesForBits(length_); if (bytes_required > 0 && bytes_required < data_->size()) { // Trim buffers RETURN_NOT_OK(data_->Resize(bytes_required)); } - *out = std::make_shared<BooleanArray>(type_, length_, data_, null_bitmap_, null_count_); + BufferVector buffers = {null_bitmap_, data_}; + *out = std::make_shared<ArrayData>(boolean(), length_, std::move(buffers), null_count_); data_ = null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; @@ -861,15 +872,12 @@ Status DictionaryBuilder<T>::Resize(int64_t capacity) { } template <typename T> -Status DictionaryBuilder<T>::Finish(std::shared_ptr<Array>* out) { +Status DictionaryBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) { std::shared_ptr<Array> dictionary; RETURN_NOT_OK(dict_builder_.Finish(&dictionary)); - std::shared_ptr<Array> values; - RETURN_NOT_OK(values_builder_.Finish(&values)); - - auto type = std::make_shared<DictionaryType>(values->type(), dictionary); - *out = std::make_shared<DictionaryArray>(type, values); + RETURN_NOT_OK(values_builder_.FinishInternal(out)); + (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary); return Status::OK(); } @@ -1104,10 +1112,12 @@ Status DecimalBuilder::Append(const Decimal128& value) { return FixedSizeBinaryBuilder::Append(value.ToBytes()); } -Status DecimalBuilder::Finish(std::shared_ptr<Array>* out) { +Status DecimalBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) { std::shared_ptr<Buffer> data; RETURN_NOT_OK(byte_builder_.Finish(&data)); - *out = std::make_shared<DecimalArray>(type_, length_, data, null_bitmap_, null_count_); + + BufferVector buffers = {null_bitmap_, data}; + *out = std::make_shared<ArrayData>(type_, length_, std::move(buffers), null_count_); return Status::OK(); } @@ -1161,20 +1171,22 @@ Status ListBuilder::Resize(int64_t capacity) { return ArrayBuilder::Resize(capacity); } -Status ListBuilder::Finish(std::shared_ptr<Array>* out) { +Status ListBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) { RETURN_NOT_OK(AppendNextOffset()); std::shared_ptr<Buffer> offsets; RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); - std::shared_ptr<Array> items = values_; - if (!items) { - RETURN_NOT_OK(value_builder_->Finish(&items)); + std::shared_ptr<ArrayData> items; + if (values_) { + items = values_->data(); + } else { + RETURN_NOT_OK(value_builder_->FinishInternal(&items)); } - *out = std::make_shared<ListArray>(type_, length_, offsets, items, null_bitmap_, - null_count_); - + BufferVector buffers = {null_bitmap_, offsets}; + *out = std::make_shared<ArrayData>(type_, length_, std::move(buffers), null_count_); + (*out)->child_data.emplace_back(std::move(items)); Reset(); return Status::OK(); } @@ -1247,13 +1259,6 @@ Status BinaryBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) { BufferVector buffers = {null_bitmap_, offsets, value_data}; *out = std::make_shared<ArrayData>(type_, length_, std::move(buffers), null_count_, 0); - return Status::OK(); -} - -Status BinaryBuilder::Finish(std::shared_ptr<Array>* out) { - std::shared_ptr<ArrayData> data; - RETURN_NOT_OK(FinishInternal(&data)); - *out = std::make_shared<BinaryArray>(data); Reset(); return Status::OK(); } @@ -1277,14 +1282,6 @@ const uint8_t* BinaryBuilder::GetValue(int64_t i, int32_t* out_length) const { StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(utf8(), pool) {} -Status StringBuilder::Finish(std::shared_ptr<Array>* out) { - std::shared_ptr<ArrayData> data; - RETURN_NOT_OK(FinishInternal(&data)); - *out = std::make_shared<StringArray>(data); - Reset(); - return Status::OK(); -} - // ---------------------------------------------------------------------- // Fixed width binary @@ -1327,11 +1324,12 @@ Status FixedSizeBinaryBuilder::Resize(int64_t capacity) { return ArrayBuilder::Resize(capacity); } -Status FixedSizeBinaryBuilder::Finish(std::shared_ptr<Array>* out) { +Status FixedSizeBinaryBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) { std::shared_ptr<Buffer> data; RETURN_NOT_OK(byte_builder_.Finish(&data)); - *out = std::make_shared<FixedSizeBinaryArray>(type_, length_, data, null_bitmap_, - null_count_); + + BufferVector buffers = {null_bitmap_, data}; + *out = std::make_shared<ArrayData>(type_, length_, std::move(buffers), null_count_); return Status::OK(); } @@ -1349,17 +1347,17 @@ StructBuilder::StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* field_builders_ = std::move(field_builders); } -Status StructBuilder::Finish(std::shared_ptr<Array>* out) { - std::vector<std::shared_ptr<Array>> fields(field_builders_.size()); +Status StructBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) { + BufferVector buffers = {null_bitmap_}; + *out = std::make_shared<ArrayData>(type_, length_, std::move(buffers), null_count_); + + (*out)->child_data.resize(field_builders_.size()); for (size_t i = 0; i < field_builders_.size(); ++i) { - RETURN_NOT_OK(field_builders_[i]->Finish(&fields[i])); + RETURN_NOT_OK(field_builders_[i]->FinishInternal(&(*out)->child_data[i])); } - *out = std::make_shared<StructArray>(type_, length_, fields, null_bitmap_, null_count_); - null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; - return Status::OK(); } http://git-wip-us.apache.org/repos/asf/arrow/blob/434df8af/cpp/src/arrow/builder.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 1ddcf27..54d11cf 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -108,9 +108,18 @@ class ARROW_EXPORT ArrayBuilder { std::shared_ptr<PoolBuffer> null_bitmap() const { return null_bitmap_; } - /// Creates new Array object to hold the contents of the builder and transfers - /// ownership of the data. This resets all variables on the builder. - virtual Status Finish(std::shared_ptr<Array>* out) = 0; + /// \brief Return result of builder as an internal generic ArrayData + /// object. Resets builder + /// + /// \param[out] out the finalized ArrayData object + /// \return Status + virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0; + + /// \brief Return result of builder as an Array object. Resets builder + /// + /// \param[out] out the finalized Array object + /// \return Status + Status Finish(std::shared_ptr<Array>* out); std::shared_ptr<DataType> type() const { return type_; } @@ -170,7 +179,7 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder { return Status::OK(); } - Status Finish(std::shared_ptr<Array>* out) override; + Status FinishInternal(std::shared_ptr<ArrayData>* out) override; }; template <typename Type> @@ -228,7 +237,7 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { /// \return Status Status Append(const std::vector<value_type>& values); - Status Finish(std::shared_ptr<Array>* out) override; + Status FinishInternal(std::shared_ptr<ArrayData>* out) override; Status Init(int64_t capacity) override; /// Increase the capacity of the builder to accommodate at least the indicated @@ -423,7 +432,7 @@ class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase Status Append(const uint64_t* values, int64_t length, const uint8_t* valid_bytes = nullptr); - Status Finish(std::shared_ptr<Array>* out) override; + Status FinishInternal(std::shared_ptr<ArrayData>* out) override; protected: Status ExpandIntSize(uint8_t new_int_size); @@ -485,7 +494,7 @@ class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase Status Append(const int64_t* values, int64_t length, const uint8_t* valid_bytes = nullptr); - Status Finish(std::shared_ptr<Array>* out) override; + Status FinishInternal(std::shared_ptr<ArrayData>* out) override; protected: Status ExpandIntSize(uint8_t new_int_size); @@ -582,7 +591,7 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { /// \return Status Status Append(const std::vector<bool>& values); - Status Finish(std::shared_ptr<Array>* out) override; + Status FinishInternal(std::shared_ptr<ArrayData>* out) override; Status Init(int64_t capacity) override; /// Increase the capacity of the builder to accommodate at least the indicated @@ -619,7 +628,7 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { Status Init(int64_t elements) override; Status Resize(int64_t capacity) override; - Status Finish(std::shared_ptr<Array>* out) override; + Status FinishInternal(std::shared_ptr<ArrayData>* out) override; /// \brief Vector append /// @@ -673,7 +682,7 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { Status Init(int64_t elements) override; Status Resize(int64_t capacity) override; - Status Finish(std::shared_ptr<Array>* out) override; + Status FinishInternal(std::shared_ptr<ArrayData>* out) override; /// \return size of values buffer so far int64_t value_data_length() const { return value_data_builder_.length(); } @@ -690,7 +699,6 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { static constexpr int64_t kMaximumCapacity = std::numeric_limits<int32_t>::max() - 1; Status AppendNextOffset(); - Status FinishInternal(std::shared_ptr<ArrayData>* out); void Reset(); }; @@ -703,8 +711,6 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder { using BinaryBuilder::Append; - Status Finish(std::shared_ptr<Array>* out) override; - Status Append(const std::vector<std::string>& values, uint8_t* null_bytes); }; @@ -732,7 +738,7 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { Status Init(int64_t elements) override; Status Resize(int64_t capacity) override; - Status Finish(std::shared_ptr<Array>* out) override; + Status FinishInternal(std::shared_ptr<ArrayData>* out) override; /// \return size of values buffer so far int64_t value_data_length() const { return byte_builder_.length(); } @@ -756,7 +762,7 @@ class ARROW_EXPORT DecimalBuilder : public FixedSizeBinaryBuilder { Status Append(const Decimal128& val); - Status Finish(std::shared_ptr<Array>* out) override; + Status FinishInternal(std::shared_ptr<ArrayData>* out) override; }; // ---------------------------------------------------------------------- @@ -772,7 +778,7 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool, std::vector<std::unique_ptr<ArrayBuilder>>&& field_builders); - Status Finish(std::shared_ptr<Array>* out) override; + Status FinishInternal(std::shared_ptr<ArrayData>* out) override; /// Null bitmap is of equal length to every child field, and any zero byte /// will be considered as a null for that field, but users must using app- @@ -875,7 +881,7 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { Status Init(int64_t elements) override; Status Resize(int64_t capacity) override; - Status Finish(std::shared_ptr<Array>* out) override; + Status FinishInternal(std::shared_ptr<ArrayData>* out) override; protected: Status DoubleTableSize();