Repository: arrow
Updated Branches:
  refs/heads/master 0d1e69c23 -> 434df8af0


ARROW-1488: [C++] Implement ArrayBuilder::Finish in terms of FinishInternal 
based on ArrayData

I don't have strong feelings about the function names here, but this was 
backwards compatible and will prevent extra box-then-unbox steps in analytics 
code that utilizes builders for accumulating outputs

Author: Wes McKinney <wes.mckin...@twosigma.com>

Closes #1191 from wesm/ARROW-1488 and squashes the following commits:

0c23ff53 [Wes McKinney] Rename Finish with ArrayData to FinishInternal
5313cab2 [Wes McKinney] Initial hack at refactoring Finish methods to return 
ArrayData


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/434df8af
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/434df8af
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/434df8af

Branch: refs/heads/master
Commit: 434df8af0527604303f98a23d826dad0ccc7ce12
Parents: 0d1e69c
Author: Wes McKinney <wes.mckin...@twosigma.com>
Authored: Thu Oct 12 12:15:44 2017 -0400
Committer: Wes McKinney <wes.mckin...@twosigma.com>
Committed: Thu Oct 12 12:15:44 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/builder.cc | 126 +++++++++++++++++++++---------------------
 cpp/src/arrow/builder.h  |  40 ++++++++------
 2 files changed, 85 insertions(+), 81 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/434df8af/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 55e7873..7152c7a 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -99,6 +99,12 @@ Status ArrayBuilder::Advance(int64_t elements) {
   return Status::OK();
 }
 
+Status ArrayBuilder::Finish(std::shared_ptr<Array>* out) {
+  std::shared_ptr<ArrayData> internal_data;
+  RETURN_NOT_OK(FinishInternal(&internal_data));
+  return MakeArray(internal_data, out);
+}
+
 Status ArrayBuilder::Reserve(int64_t elements) {
   if (length_ + elements > capacity_) {
     // TODO(emkornfield) power of 2 growth is potentially suboptimal
@@ -213,8 +219,9 @@ void ArrayBuilder::UnsafeSetNotNull(int64_t length) {
 // ----------------------------------------------------------------------
 // Null builder
 
-Status NullBuilder::Finish(std::shared_ptr<Array>* out) {
-  *out = std::make_shared<NullArray>(length_);
+Status NullBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  BufferVector buffers = {nullptr};
+  *out = std::make_shared<ArrayData>(null(), length_, std::move(buffers), 
length_);
   length_ = null_count_ = 0;
   return Status::OK();
 }
@@ -302,14 +309,14 @@ Status PrimitiveBuilder<T>::Append(const 
std::vector<value_type>& values) {
 }
 
 template <typename T>
-Status PrimitiveBuilder<T>::Finish(std::shared_ptr<Array>* out) {
+Status PrimitiveBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) {
   const int64_t bytes_required = TypeTraits<T>::bytes_required(length_);
   if (bytes_required > 0 && bytes_required < data_->size()) {
     // Trim buffers
     RETURN_NOT_OK(data_->Resize(bytes_required));
   }
-  *out = std::make_shared<typename TypeTraits<T>::ArrayType>(type_, length_, 
data_,
-                                                             null_bitmap_, 
null_count_);
+  BufferVector buffers = {null_bitmap_, data_};
+  *out = std::make_shared<ArrayData>(type_, length_, std::move(buffers), 
null_count_);
 
   data_ = null_bitmap_ = nullptr;
   capacity_ = length_ = null_count_ = 0;
@@ -372,34 +379,36 @@ Status AdaptiveIntBuilderBase::Resize(int64_t capacity) {
 
 AdaptiveIntBuilder::AdaptiveIntBuilder(MemoryPool* pool) : 
AdaptiveIntBuilderBase(pool) {}
 
-Status AdaptiveIntBuilder::Finish(std::shared_ptr<Array>* out) {
+Status AdaptiveIntBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
   const int64_t bytes_required = length_ * int_size_;
   if (bytes_required > 0 && bytes_required < data_->size()) {
     // Trim buffers
     RETURN_NOT_OK(data_->Resize(bytes_required));
   }
+
+  std::shared_ptr<DataType> output_type;
   switch (int_size_) {
     case 1:
-      *out =
-          std::make_shared<Int8Array>(int8(), length_, data_, null_bitmap_, 
null_count_);
+      output_type = int8();
       break;
     case 2:
-      *out = std::make_shared<Int16Array>(int16(), length_, data_, 
null_bitmap_,
-                                          null_count_);
+      output_type = int16();
       break;
     case 4:
-      *out = std::make_shared<Int32Array>(int32(), length_, data_, 
null_bitmap_,
-                                          null_count_);
+      output_type = int32();
       break;
     case 8:
-      *out = std::make_shared<Int64Array>(int64(), length_, data_, 
null_bitmap_,
-                                          null_count_);
+      output_type = int64();
       break;
     default:
       DCHECK(false);
       return Status::NotImplemented("Only ints of size 1,2,4,8 are supported");
   }
 
+  BufferVector buffers = {null_bitmap_, data_};
+  *out =
+      std::make_shared<ArrayData>(output_type, length_, std::move(buffers), 
null_count_);
+
   data_ = null_bitmap_ = nullptr;
   capacity_ = length_ = null_count_ = 0;
   return Status::OK();
@@ -529,34 +538,35 @@ Status AdaptiveIntBuilder::ExpandIntSize(uint8_t 
new_int_size) {
 AdaptiveUIntBuilder::AdaptiveUIntBuilder(MemoryPool* pool)
     : AdaptiveIntBuilderBase(pool) {}
 
-Status AdaptiveUIntBuilder::Finish(std::shared_ptr<Array>* out) {
+Status AdaptiveUIntBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
   const int64_t bytes_required = length_ * int_size_;
   if (bytes_required > 0 && bytes_required < data_->size()) {
     // Trim buffers
     RETURN_NOT_OK(data_->Resize(bytes_required));
   }
+  std::shared_ptr<DataType> output_type;
   switch (int_size_) {
     case 1:
-      *out = std::make_shared<UInt8Array>(uint8(), length_, data_, 
null_bitmap_,
-                                          null_count_);
+      output_type = uint8();
       break;
     case 2:
-      *out = std::make_shared<UInt16Array>(uint16(), length_, data_, 
null_bitmap_,
-                                           null_count_);
+      output_type = uint16();
       break;
     case 4:
-      *out = std::make_shared<UInt32Array>(uint32(), length_, data_, 
null_bitmap_,
-                                           null_count_);
+      output_type = uint32();
       break;
     case 8:
-      *out = std::make_shared<UInt64Array>(uint64(), length_, data_, 
null_bitmap_,
-                                           null_count_);
+      output_type = uint64();
       break;
     default:
       DCHECK(false);
       return Status::NotImplemented("Only ints of size 1,2,4,8 are supported");
   }
 
+  BufferVector buffers = {null_bitmap_, data_};
+  *out =
+      std::make_shared<ArrayData>(output_type, length_, std::move(buffers), 
null_count_);
+
   data_ = null_bitmap_ = nullptr;
   capacity_ = length_ = null_count_ = 0;
   return Status::OK();
@@ -725,14 +735,15 @@ Status BooleanBuilder::Resize(int64_t capacity) {
   return Status::OK();
 }
 
-Status BooleanBuilder::Finish(std::shared_ptr<Array>* out) {
+Status BooleanBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
   const int64_t bytes_required = BitUtil::BytesForBits(length_);
 
   if (bytes_required > 0 && bytes_required < data_->size()) {
     // Trim buffers
     RETURN_NOT_OK(data_->Resize(bytes_required));
   }
-  *out = std::make_shared<BooleanArray>(type_, length_, data_, null_bitmap_, 
null_count_);
+  BufferVector buffers = {null_bitmap_, data_};
+  *out = std::make_shared<ArrayData>(boolean(), length_, std::move(buffers), 
null_count_);
 
   data_ = null_bitmap_ = nullptr;
   capacity_ = length_ = null_count_ = 0;
@@ -861,15 +872,12 @@ Status DictionaryBuilder<T>::Resize(int64_t capacity) {
 }
 
 template <typename T>
-Status DictionaryBuilder<T>::Finish(std::shared_ptr<Array>* out) {
+Status DictionaryBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) {
   std::shared_ptr<Array> dictionary;
   RETURN_NOT_OK(dict_builder_.Finish(&dictionary));
 
-  std::shared_ptr<Array> values;
-  RETURN_NOT_OK(values_builder_.Finish(&values));
-
-  auto type = std::make_shared<DictionaryType>(values->type(), dictionary);
-  *out = std::make_shared<DictionaryArray>(type, values);
+  RETURN_NOT_OK(values_builder_.FinishInternal(out));
+  (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary);
   return Status::OK();
 }
 
@@ -1104,10 +1112,12 @@ Status DecimalBuilder::Append(const Decimal128& value) {
   return FixedSizeBinaryBuilder::Append(value.ToBytes());
 }
 
-Status DecimalBuilder::Finish(std::shared_ptr<Array>* out) {
+Status DecimalBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
   std::shared_ptr<Buffer> data;
   RETURN_NOT_OK(byte_builder_.Finish(&data));
-  *out = std::make_shared<DecimalArray>(type_, length_, data, null_bitmap_, 
null_count_);
+
+  BufferVector buffers = {null_bitmap_, data};
+  *out = std::make_shared<ArrayData>(type_, length_, std::move(buffers), 
null_count_);
   return Status::OK();
 }
 
@@ -1161,20 +1171,22 @@ Status ListBuilder::Resize(int64_t capacity) {
   return ArrayBuilder::Resize(capacity);
 }
 
-Status ListBuilder::Finish(std::shared_ptr<Array>* out) {
+Status ListBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
   RETURN_NOT_OK(AppendNextOffset());
 
   std::shared_ptr<Buffer> offsets;
   RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
 
-  std::shared_ptr<Array> items = values_;
-  if (!items) {
-    RETURN_NOT_OK(value_builder_->Finish(&items));
+  std::shared_ptr<ArrayData> items;
+  if (values_) {
+    items = values_->data();
+  } else {
+    RETURN_NOT_OK(value_builder_->FinishInternal(&items));
   }
 
-  *out = std::make_shared<ListArray>(type_, length_, offsets, items, 
null_bitmap_,
-                                     null_count_);
-
+  BufferVector buffers = {null_bitmap_, offsets};
+  *out = std::make_shared<ArrayData>(type_, length_, std::move(buffers), 
null_count_);
+  (*out)->child_data.emplace_back(std::move(items));
   Reset();
   return Status::OK();
 }
@@ -1247,13 +1259,6 @@ Status 
BinaryBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
 
   BufferVector buffers = {null_bitmap_, offsets, value_data};
   *out = std::make_shared<ArrayData>(type_, length_, std::move(buffers), 
null_count_, 0);
-  return Status::OK();
-}
-
-Status BinaryBuilder::Finish(std::shared_ptr<Array>* out) {
-  std::shared_ptr<ArrayData> data;
-  RETURN_NOT_OK(FinishInternal(&data));
-  *out = std::make_shared<BinaryArray>(data);
   Reset();
   return Status::OK();
 }
@@ -1277,14 +1282,6 @@ const uint8_t* BinaryBuilder::GetValue(int64_t i, 
int32_t* out_length) const {
 
 StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(utf8(), pool) {}
 
-Status StringBuilder::Finish(std::shared_ptr<Array>* out) {
-  std::shared_ptr<ArrayData> data;
-  RETURN_NOT_OK(FinishInternal(&data));
-  *out = std::make_shared<StringArray>(data);
-  Reset();
-  return Status::OK();
-}
-
 // ----------------------------------------------------------------------
 // Fixed width binary
 
@@ -1327,11 +1324,12 @@ Status FixedSizeBinaryBuilder::Resize(int64_t capacity) 
{
   return ArrayBuilder::Resize(capacity);
 }
 
-Status FixedSizeBinaryBuilder::Finish(std::shared_ptr<Array>* out) {
+Status FixedSizeBinaryBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) 
{
   std::shared_ptr<Buffer> data;
   RETURN_NOT_OK(byte_builder_.Finish(&data));
-  *out = std::make_shared<FixedSizeBinaryArray>(type_, length_, data, 
null_bitmap_,
-                                                null_count_);
+
+  BufferVector buffers = {null_bitmap_, data};
+  *out = std::make_shared<ArrayData>(type_, length_, std::move(buffers), 
null_count_);
   return Status::OK();
 }
 
@@ -1349,17 +1347,17 @@ StructBuilder::StructBuilder(const 
std::shared_ptr<DataType>& type, MemoryPool*
   field_builders_ = std::move(field_builders);
 }
 
-Status StructBuilder::Finish(std::shared_ptr<Array>* out) {
-  std::vector<std::shared_ptr<Array>> fields(field_builders_.size());
+Status StructBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  BufferVector buffers = {null_bitmap_};
+  *out = std::make_shared<ArrayData>(type_, length_, std::move(buffers), 
null_count_);
+
+  (*out)->child_data.resize(field_builders_.size());
   for (size_t i = 0; i < field_builders_.size(); ++i) {
-    RETURN_NOT_OK(field_builders_[i]->Finish(&fields[i]));
+    RETURN_NOT_OK(field_builders_[i]->FinishInternal(&(*out)->child_data[i]));
   }
 
-  *out = std::make_shared<StructArray>(type_, length_, fields, null_bitmap_, 
null_count_);
-
   null_bitmap_ = nullptr;
   capacity_ = length_ = null_count_ = 0;
-
   return Status::OK();
 }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/434df8af/cpp/src/arrow/builder.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 1ddcf27..54d11cf 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -108,9 +108,18 @@ class ARROW_EXPORT ArrayBuilder {
 
   std::shared_ptr<PoolBuffer> null_bitmap() const { return null_bitmap_; }
 
-  /// Creates new Array object to hold the contents of the builder and 
transfers
-  /// ownership of the data.  This resets all variables on the builder.
-  virtual Status Finish(std::shared_ptr<Array>* out) = 0;
+  /// \brief Return result of builder as an internal generic ArrayData
+  /// object. Resets builder
+  ///
+  /// \param[out] out the finalized ArrayData object
+  /// \return Status
+  virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0;
+
+  /// \brief Return result of builder as an Array object. Resets builder
+  ///
+  /// \param[out] out the finalized Array object
+  /// \return Status
+  Status Finish(std::shared_ptr<Array>* out);
 
   std::shared_ptr<DataType> type() const { return type_; }
 
@@ -170,7 +179,7 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder {
     return Status::OK();
   }
 
-  Status Finish(std::shared_ptr<Array>* out) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
 };
 
 template <typename Type>
@@ -228,7 +237,7 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder {
   /// \return Status
   Status Append(const std::vector<value_type>& values);
 
-  Status Finish(std::shared_ptr<Array>* out) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
   Status Init(int64_t capacity) override;
 
   /// Increase the capacity of the builder to accommodate at least the 
indicated
@@ -423,7 +432,7 @@ class ARROW_EXPORT AdaptiveUIntBuilder : public 
internal::AdaptiveIntBuilderBase
   Status Append(const uint64_t* values, int64_t length,
                 const uint8_t* valid_bytes = nullptr);
 
-  Status Finish(std::shared_ptr<Array>* out) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
 
  protected:
   Status ExpandIntSize(uint8_t new_int_size);
@@ -485,7 +494,7 @@ class ARROW_EXPORT AdaptiveIntBuilder : public 
internal::AdaptiveIntBuilderBase
   Status Append(const int64_t* values, int64_t length,
                 const uint8_t* valid_bytes = nullptr);
 
-  Status Finish(std::shared_ptr<Array>* out) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
 
  protected:
   Status ExpandIntSize(uint8_t new_int_size);
@@ -582,7 +591,7 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
   /// \return Status
   Status Append(const std::vector<bool>& values);
 
-  Status Finish(std::shared_ptr<Array>* out) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
   Status Init(int64_t capacity) override;
 
   /// Increase the capacity of the builder to accommodate at least the 
indicated
@@ -619,7 +628,7 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder {
 
   Status Init(int64_t elements) override;
   Status Resize(int64_t capacity) override;
-  Status Finish(std::shared_ptr<Array>* out) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
 
   /// \brief Vector append
   ///
@@ -673,7 +682,7 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
 
   Status Init(int64_t elements) override;
   Status Resize(int64_t capacity) override;
-  Status Finish(std::shared_ptr<Array>* out) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
 
   /// \return size of values buffer so far
   int64_t value_data_length() const { return value_data_builder_.length(); }
@@ -690,7 +699,6 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
   static constexpr int64_t kMaximumCapacity = 
std::numeric_limits<int32_t>::max() - 1;
 
   Status AppendNextOffset();
-  Status FinishInternal(std::shared_ptr<ArrayData>* out);
   void Reset();
 };
 
@@ -703,8 +711,6 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder {
 
   using BinaryBuilder::Append;
 
-  Status Finish(std::shared_ptr<Array>* out) override;
-
   Status Append(const std::vector<std::string>& values, uint8_t* null_bytes);
 };
 
@@ -732,7 +738,7 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public 
ArrayBuilder {
 
   Status Init(int64_t elements) override;
   Status Resize(int64_t capacity) override;
-  Status Finish(std::shared_ptr<Array>* out) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
 
   /// \return size of values buffer so far
   int64_t value_data_length() const { return byte_builder_.length(); }
@@ -756,7 +762,7 @@ class ARROW_EXPORT DecimalBuilder : public 
FixedSizeBinaryBuilder {
 
   Status Append(const Decimal128& val);
 
-  Status Finish(std::shared_ptr<Array>* out) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
 };
 
 // ----------------------------------------------------------------------
@@ -772,7 +778,7 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder {
   StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
                 std::vector<std::unique_ptr<ArrayBuilder>>&& field_builders);
 
-  Status Finish(std::shared_ptr<Array>* out) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
 
   /// Null bitmap is of equal length to every child field, and any zero byte
   /// will be considered as a null for that field, but users must using app-
@@ -875,7 +881,7 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
 
   Status Init(int64_t elements) override;
   Status Resize(int64_t capacity) override;
-  Status Finish(std::shared_ptr<Array>* out) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
 
  protected:
   Status DoubleTableSize();

Reply via email to