This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new bbca717 ARROW-4341: [C++] Refactor Primitive builders and
BooleanBuilder to use TypedBufferBuilder<T>
bbca717 is described below
commit bbca7178adf1214425f95164a887989b018707f8
Author: Benjamin Kietzman <[email protected]>
AuthorDate: Sat Feb 16 17:44:46 2019 -0600
ARROW-4341: [C++] Refactor Primitive builders and BooleanBuilder to use
TypedBufferBuilder<T>
This reduces code duplication.
Author: Benjamin Kietzman <[email protected]>
Author: Wes McKinney <[email protected]>
Closes #3575 from bkietz/ARROW-4341-primitive-builders-use-bufferbuilder
and squashes the following commits:
3ef29721b <Wes McKinney> Fix BooleanBuilder::AppendNulls, remove
valid_bytes argument from AppendNulls methods
40c4d8d5c <Benjamin Kietzman> TypedBufferBuilder<bool>'s output was not
correctly sized
b389c1308 <Wes McKinney> Revert changes to arrow/util/logging.h
daf524423 <Wes McKinney> Revert change to UnsafeAppend that broke Python
unit test
3cc5a0c46 <Wes McKinney> Restore memory zeroing. Add missing override
21ce28558 <Wes McKinney> Fix RETURN_NOT_OK usages
d4ab3b539 <Wes McKinney> Move NumericBuilder implementation to headers to
avoid symbol visibility concerns
6c1e99d4e <Wes McKinney> Add TypedBufferBuilder<bool> UnsafeAppend
compile-time option to not track falses. Restore faster code from before this
patch for appending C arrays and vector<bool>
09d2bfe8f <Benjamin Kietzman> reduce unnecessary zeroing in BufferBuilder
bd736c3db <Benjamin Kietzman> add ArrowLogIgnore and use for release mode
DCHECK*
7ba692cc9 <Benjamin Kietzman> moving to iterator append in NumericBuilder
188b7b9cc <Benjamin Kietzman> fix format
893457306 <Benjamin Kietzman> add explicit cast
88e57fe58 <Benjamin Kietzman> remove PrimitiveBuilder
9c050b4dc <Benjamin Kietzman> Use TypedBufferBuilder for PrimitiveBuilder
078497a38 <Benjamin Kietzman> fix BooleanBuilder::AppendNull
88eb71c97 <Benjamin Kietzman> Use TypedBufferBuilder<bool> in BooleanBuilder
---
c_glib/arrow-glib/array-builder.cpp | 4 +-
cpp/src/arrow/array-test.cc | 43 ++++--
cpp/src/arrow/array/builder_adaptive.h | 7 +-
cpp/src/arrow/array/builder_base.cc | 6 +
cpp/src/arrow/array/builder_base.h | 2 +
cpp/src/arrow/array/builder_primitive.cc | 169 ++----------------------
cpp/src/arrow/array/builder_primitive.h | 217 +++++++++++++++----------------
cpp/src/arrow/buffer-builder.h | 40 +++++-
cpp/src/arrow/buffer-test.cc | 4 +
9 files changed, 201 insertions(+), 291 deletions(-)
diff --git a/c_glib/arrow-glib/array-builder.cpp
b/c_glib/arrow-glib/array-builder.cpp
index 095c68d..afdae8c 100644
--- a/c_glib/arrow-glib/array-builder.cpp
+++ b/c_glib/arrow-glib/array-builder.cpp
@@ -136,9 +136,7 @@ garrow_array_builder_append_nulls(GArrowArrayBuilder
*builder,
auto arrow_builder =
static_cast<BUILDER>(garrow_array_builder_get_raw(builder));
- uint8_t valid_bytes[n];
- memset(valid_bytes, 0, sizeof(uint8_t) * n);
- auto status = arrow_builder->AppendNulls(valid_bytes, n);
+ auto status = arrow_builder->AppendNulls(n);
return garrow_error_check(error, status, context);
}
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 3b348de..dead8de 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -478,7 +478,11 @@ void TestPrimitiveBuilder<PBoolean>::Check(const
std::unique_ptr<BooleanBuilder>
ASSERT_EQ(draws_[i] != 0, actual) << i;
}
}
- ASSERT_TRUE(result->Equals(*expected));
+ AssertArraysEqual(*result, *expected);
+
+ // buffers are correctly sized
+ ASSERT_EQ(result->data()->buffers[0]->size(), BitUtil::BytesForBits(size));
+ ASSERT_EQ(result->data()->buffers[1]->size(), BitUtil::BytesForBits(size));
// Builder is now reset
ASSERT_EQ(0, builder->length());
@@ -518,15 +522,13 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendNull) {
TYPED_TEST(TestPrimitiveBuilder, TestAppendNulls) {
const int64_t size = 10;
- const uint8_t valid_bytes[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
-
- ASSERT_OK(this->builder_->AppendNulls(valid_bytes, size));
+ ASSERT_OK(this->builder_->AppendNulls(size));
std::shared_ptr<Array> result;
FinishAndCheckPadding(this->builder_.get(), &result);
for (int64_t i = 0; i < size; ++i) {
- ASSERT_EQ(result->IsValid(i), static_cast<bool>(valid_bytes[i]));
+ ASSERT_FALSE(result->IsValid(i));
}
}
@@ -922,6 +924,27 @@ TYPED_TEST(TestPrimitiveBuilder, TestReserve) {
ASSERT_EQ(BitUtil::NextPower2(kMinBuilderCapacity + 100),
this->builder_->capacity());
}
+TEST(TestBooleanBuilder, AppendNullsAdvanceBuilder) {
+ BooleanBuilder builder;
+
+ std::vector<uint8_t> values = {1, 0, 0, 1};
+ std::vector<uint8_t> is_valid = {1, 1, 0, 1};
+
+ std::shared_ptr<Array> arr;
+ ASSERT_OK(builder.AppendValues(values.data(), 2));
+ ASSERT_OK(builder.AppendNulls(1));
+ ASSERT_OK(builder.AppendValues(values.data() + 3, 1));
+ ASSERT_OK(builder.Finish(&arr));
+
+ ASSERT_EQ(1, arr->null_count());
+
+ const auto& barr = static_cast<const BooleanArray&>(*arr);
+ ASSERT_TRUE(barr.Value(0));
+ ASSERT_FALSE(barr.Value(1));
+ ASSERT_TRUE(barr.IsNull(2));
+ ASSERT_TRUE(barr.Value(3));
+}
+
TEST(TestBooleanBuilder, TestStdBoolVectorAppend) {
BooleanBuilder builder;
BooleanBuilder builder_nn;
@@ -1391,13 +1414,12 @@ TEST_F(TestAdaptiveIntBuilder, TestAppendNull) {
TEST_F(TestAdaptiveIntBuilder, TestAppendNulls) {
constexpr int64_t size = 10;
- const uint8_t valid_bytes[size] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
- ASSERT_OK(builder_->AppendNulls(valid_bytes, size));
+ ASSERT_OK(builder_->AppendNulls(size));
Done();
for (unsigned index = 0; index < size; ++index) {
- ASSERT_EQ(result_->IsValid(index), static_cast<bool>(valid_bytes[index]));
+ ASSERT_FALSE(result_->IsValid(index));
}
}
@@ -1526,13 +1548,12 @@ TEST_F(TestAdaptiveUIntBuilder, TestAppendNull) {
TEST_F(TestAdaptiveUIntBuilder, TestAppendNulls) {
constexpr int64_t size = 10;
- const uint8_t valid_bytes[size] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
- ASSERT_OK(builder_->AppendNulls(valid_bytes, size));
+ ASSERT_OK(builder_->AppendNulls(size));
Done();
for (unsigned index = 0; index < size; ++index) {
- ASSERT_EQ(result_->IsValid(index), static_cast<bool>(valid_bytes[index]));
+ ASSERT_FALSE(result_->IsValid(index));
}
}
diff --git a/cpp/src/arrow/array/builder_adaptive.h
b/cpp/src/arrow/array/builder_adaptive.h
index 6523de4..afbfca2 100644
--- a/cpp/src/arrow/array/builder_adaptive.h
+++ b/cpp/src/arrow/array/builder_adaptive.h
@@ -29,12 +29,13 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public
ArrayBuilder {
public:
explicit AdaptiveIntBuilderBase(MemoryPool* pool);
- /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated
memory
- Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
+ /// \brief Append multiple nulls
+ /// \param[in] length the number of nulls to append
+ Status AppendNulls(int64_t length) {
ARROW_RETURN_NOT_OK(CommitPendingData());
ARROW_RETURN_NOT_OK(Reserve(length));
memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
- UnsafeAppendToBitmap(valid_bytes, length);
+ UnsafeSetNull(length);
return Status::OK();
}
diff --git a/cpp/src/arrow/array/builder_base.cc
b/cpp/src/arrow/array/builder_base.cc
index e805900..75baedd 100644
--- a/cpp/src/arrow/array/builder_base.cc
+++ b/cpp/src/arrow/array/builder_base.cc
@@ -106,4 +106,10 @@ void ArrayBuilder::UnsafeSetNotNull(int64_t length) {
null_bitmap_builder_.UnsafeAppend(length, true);
}
+void ArrayBuilder::UnsafeSetNull(int64_t length) {
+ length_ += length;
+ null_count_ += length;
+ null_bitmap_builder_.UnsafeAppend(length, false);
+}
+
} // namespace arrow
diff --git a/cpp/src/arrow/array/builder_base.h
b/cpp/src/arrow/array/builder_base.h
index f4655fa..ebe1ece 100644
--- a/cpp/src/arrow/array/builder_base.h
+++ b/cpp/src/arrow/array/builder_base.h
@@ -162,6 +162,8 @@ class ARROW_EXPORT ArrayBuilder {
// Set the next length bits to not null (i.e. valid).
void UnsafeSetNotNull(int64_t length);
+ void UnsafeSetNull(int64_t length);
+
static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer*
buffer);
static Status CheckCapacity(int64_t new_capacity, int64_t old_capacity) {
diff --git a/cpp/src/arrow/array/builder_primitive.cc
b/cpp/src/arrow/array/builder_primitive.cc
index a593f36..13e8f2e 100644
--- a/cpp/src/arrow/array/builder_primitive.cc
+++ b/cpp/src/arrow/array/builder_primitive.cc
@@ -45,104 +45,8 @@ Status
NullBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
return Status::OK();
}
-// ----------------------------------------------------------------------
-
-template <typename T>
-void PrimitiveBuilder<T>::Reset() {
- data_.reset();
- raw_data_ = nullptr;
-}
-
-template <typename T>
-Status PrimitiveBuilder<T>::Resize(int64_t capacity) {
- RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
- capacity = std::max(capacity, kMinBuilderCapacity);
-
- int64_t nbytes = TypeTraits<T>::bytes_required(capacity);
- if (capacity_ == 0) {
- RETURN_NOT_OK(AllocateResizableBuffer(pool_, nbytes, &data_));
- } else {
- RETURN_NOT_OK(data_->Resize(nbytes));
- }
-
- raw_data_ = reinterpret_cast<value_type*>(data_->mutable_data());
- return ArrayBuilder::Resize(capacity);
-}
-
-template <typename T>
-Status PrimitiveBuilder<T>::AppendValues(const value_type* values, int64_t
length,
- const uint8_t* valid_bytes) {
- RETURN_NOT_OK(Reserve(length));
-
- if (length > 0) {
- std::memcpy(raw_data_ + length_, values,
-
static_cast<std::size_t>(TypeTraits<T>::bytes_required(length)));
- }
-
- // length_ is update by these
- ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length);
- return Status::OK();
-}
-
-template <typename T>
-Status PrimitiveBuilder<T>::AppendValues(const value_type* values, int64_t
length,
- const std::vector<bool>& is_valid) {
- RETURN_NOT_OK(Reserve(length));
- DCHECK_EQ(length, static_cast<int64_t>(is_valid.size()));
-
- if (length > 0) {
- std::memcpy(raw_data_ + length_, values,
-
static_cast<std::size_t>(TypeTraits<T>::bytes_required(length)));
- }
-
- // length_ is update by these
- ArrayBuilder::UnsafeAppendToBitmap(is_valid);
- return Status::OK();
-}
-
-template <typename T>
-Status PrimitiveBuilder<T>::AppendValues(const std::vector<value_type>& values,
- const std::vector<bool>& is_valid) {
- return AppendValues(values.data(), static_cast<int64_t>(values.size()),
is_valid);
-}
-
-template <typename T>
-Status PrimitiveBuilder<T>::AppendValues(const std::vector<value_type>&
values) {
- return AppendValues(values.data(), static_cast<int64_t>(values.size()));
-}
-
-template <typename T>
-Status PrimitiveBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) {
- RETURN_NOT_OK(TrimBuffer(TypeTraits<T>::bytes_required(length_),
data_.get()));
- std::shared_ptr<Buffer> null_bitmap;
- RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
- *out = ArrayData::Make(type_, length_, {null_bitmap, data_}, null_count_);
-
- data_ = nullptr;
- capacity_ = length_ = null_count_ = 0;
-
- return Status::OK();
-}
-
-template class PrimitiveBuilder<UInt8Type>;
-template class PrimitiveBuilder<UInt16Type>;
-template class PrimitiveBuilder<UInt32Type>;
-template class PrimitiveBuilder<UInt64Type>;
-template class PrimitiveBuilder<Int8Type>;
-template class PrimitiveBuilder<Int16Type>;
-template class PrimitiveBuilder<Int32Type>;
-template class PrimitiveBuilder<Int64Type>;
-template class PrimitiveBuilder<Date32Type>;
-template class PrimitiveBuilder<Date64Type>;
-template class PrimitiveBuilder<Time32Type>;
-template class PrimitiveBuilder<Time64Type>;
-template class PrimitiveBuilder<TimestampType>;
-template class PrimitiveBuilder<HalfFloatType>;
-template class PrimitiveBuilder<FloatType>;
-template class PrimitiveBuilder<DoubleType>;
-
BooleanBuilder::BooleanBuilder(MemoryPool* pool)
- : ArrayBuilder(boolean(), pool), data_(nullptr), raw_data_(nullptr) {}
+ : ArrayBuilder(boolean(), pool), data_builder_(pool) {}
BooleanBuilder::BooleanBuilder(const std::shared_ptr<DataType>& type,
MemoryPool* pool)
: BooleanBuilder(pool) {
@@ -151,57 +55,23 @@ BooleanBuilder::BooleanBuilder(const
std::shared_ptr<DataType>& type, MemoryPool
void BooleanBuilder::Reset() {
ArrayBuilder::Reset();
- data_.reset();
- raw_data_ = nullptr;
+ data_builder_.Reset();
}
Status BooleanBuilder::Resize(int64_t capacity) {
RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
capacity = std::max(capacity, kMinBuilderCapacity);
-
- const int64_t new_bitmap_size = BitUtil::BytesForBits(capacity);
- if (capacity_ == 0) {
- RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_bitmap_size, &data_));
- raw_data_ = reinterpret_cast<uint8_t*>(data_->mutable_data());
-
- // We zero the memory for booleans to keep things simple; for some reason
if
- // we do not, even though we may write every bit (through in-place | or &),
- // valgrind will still show a warning. If we do not zero the bytes here, we
- // will have to be careful to zero them in AppendNull and AppendNulls.
Also,
- // zeroing the bits results in deterministic bits when each byte may have a
- // mix of nulls and not nulls.
- //
- // We only zero up to new_bitmap_size because the padding was zeroed by
- // AllocateResizableBuffer
- memset(raw_data_, 0, static_cast<size_t>(new_bitmap_size));
- } else {
- const int64_t old_bitmap_capacity = data_->capacity();
- RETURN_NOT_OK(data_->Resize(new_bitmap_size));
- const int64_t new_bitmap_capacity = data_->capacity();
- raw_data_ = reinterpret_cast<uint8_t*>(data_->mutable_data());
-
- // See comment above about why we zero memory for booleans
- memset(raw_data_ + old_bitmap_capacity, 0,
- static_cast<size_t>(new_bitmap_capacity - old_bitmap_capacity));
- }
-
+ RETURN_NOT_OK(data_builder_.Resize(capacity));
return ArrayBuilder::Resize(capacity);
}
Status BooleanBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
- int64_t bit_offset = length_ % 8;
- if (bit_offset > 0) {
- // Adjust last byte
- data_->mutable_data()[length_ / 8] &=
BitUtil::kPrecedingBitmask[bit_offset];
- }
-
- std::shared_ptr<Buffer> null_bitmap;
+ std::shared_ptr<Buffer> data, null_bitmap;
+ RETURN_NOT_OK(data_builder_.Finish(&data));
RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
- RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), data_.get()));
- *out = ArrayData::Make(boolean(), length_, {null_bitmap, data_},
null_count_);
+ *out = ArrayData::Make(boolean(), length_, {null_bitmap, data}, null_count_);
- data_ = nullptr;
capacity_ = length_ = null_count_ = 0;
return Status::OK();
}
@@ -211,10 +81,8 @@ Status BooleanBuilder::AppendValues(const uint8_t* values,
int64_t length,
RETURN_NOT_OK(Reserve(length));
int64_t i = 0;
- internal::GenerateBitsUnrolled(raw_data_, length_, length,
- [values, &i]() -> bool { return values[i++]
!= 0; });
-
- // this updates length_
+ data_builder_.UnsafeAppend<false>(length,
+ [values, &i]() -> bool { return
values[i++] != 0; });
ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length);
return Status::OK();
}
@@ -223,12 +91,9 @@ Status BooleanBuilder::AppendValues(const uint8_t* values,
int64_t length,
const std::vector<bool>& is_valid) {
RETURN_NOT_OK(Reserve(length));
DCHECK_EQ(length, static_cast<int64_t>(is_valid.size()));
-
int64_t i = 0;
- internal::GenerateBitsUnrolled(raw_data_, length_, length,
- [values, &i]() -> bool { return values[i++];
});
-
- // this updates length_
+ data_builder_.UnsafeAppend<false>(length,
+ [values, &i]() -> bool { return
values[i++]; });
ArrayBuilder::UnsafeAppendToBitmap(is_valid);
return Status::OK();
}
@@ -247,12 +112,9 @@ Status BooleanBuilder::AppendValues(const
std::vector<bool>& values,
const int64_t length = static_cast<int64_t>(values.size());
RETURN_NOT_OK(Reserve(length));
DCHECK_EQ(length, static_cast<int64_t>(is_valid.size()));
-
int64_t i = 0;
- internal::GenerateBitsUnrolled(raw_data_, length_, length,
- [&values, &i]() -> bool { return values[i++];
});
-
- // this updates length_
+ data_builder_.UnsafeAppend<false>(length,
+ [&values, &i]() -> bool { return
values[i++]; });
ArrayBuilder::UnsafeAppendToBitmap(is_valid);
return Status::OK();
}
@@ -260,12 +122,9 @@ Status BooleanBuilder::AppendValues(const
std::vector<bool>& values,
Status BooleanBuilder::AppendValues(const std::vector<bool>& values) {
const int64_t length = static_cast<int64_t>(values.size());
RETURN_NOT_OK(Reserve(length));
-
int64_t i = 0;
- internal::GenerateBitsUnrolled(raw_data_, length_, length,
- [&values, &i]() -> bool { return values[i++];
});
-
- // this updates length_
+ data_builder_.UnsafeAppend<false>(length,
+ [&values, &i]() -> bool { return
values[i++]; });
ArrayBuilder::UnsafeSetNotNull(length);
return Status::OK();
}
diff --git a/cpp/src/arrow/array/builder_primitive.h
b/cpp/src/arrow/array/builder_primitive.h
index bf3ec91..5a9b694 100644
--- a/cpp/src/arrow/array/builder_primitive.h
+++ b/cpp/src/arrow/array/builder_primitive.h
@@ -21,6 +21,7 @@
#include <memory>
#include <vector>
+#include "arrow/array.h"
#include "arrow/array/builder_base.h"
#include "arrow/type.h"
@@ -42,37 +43,53 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder {
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
};
-template <typename Type>
-class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder {
+/// Base class for all Builders that emit an Array of a scalar numerical type.
+template <typename T>
+class NumericBuilder : public ArrayBuilder {
public:
- using value_type = typename Type::c_type;
+ using value_type = typename T::c_type;
+ using ArrayBuilder::ArrayBuilder;
- explicit PrimitiveBuilder(const std::shared_ptr<DataType>& type, MemoryPool*
pool)
- : ArrayBuilder(type, pool), data_(NULLPTR), raw_data_(NULLPTR) {}
+ template <typename T1 = T>
+ explicit NumericBuilder(
+ typename std::enable_if<TypeTraits<T1>::is_parameter_free,
MemoryPool*>::type pool
+ ARROW_MEMORY_POOL_DEFAULT)
+ : ArrayBuilder(TypeTraits<T1>::type_singleton(), pool) {}
- using ArrayBuilder::Advance;
+ /// Append a single scalar and increase the size if necessary.
+ Status Append(const value_type val) {
+ ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1));
+ UnsafeAppend(val);
+ return Status::OK();
+ }
/// Write nulls as uint8_t* (0 value indicates null) into pre-allocated
memory
/// The memory at the corresponding data slot is set to 0 to prevent
/// uninitialized memory access
- Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
+ Status AppendNulls(int64_t length) {
ARROW_RETURN_NOT_OK(Reserve(length));
- memset(raw_data_ + length_, 0,
- static_cast<size_t>(TypeTraits<Type>::bytes_required(length)));
- UnsafeAppendToBitmap(valid_bytes, length);
+ data_builder_.UnsafeAppend(length, static_cast<value_type>(0));
+ UnsafeSetNull(length);
return Status::OK();
}
/// \brief Append a single null element
Status AppendNull() {
ARROW_RETURN_NOT_OK(Reserve(1));
- memset(raw_data_ + length_, 0, sizeof(value_type));
+ data_builder_.UnsafeAppend(static_cast<value_type>(0));
UnsafeAppendToBitmap(false);
return Status::OK();
}
- value_type GetValue(int64_t index) const {
- return reinterpret_cast<const value_type*>(data_->data())[index];
+ value_type GetValue(int64_t index) const { return
data_builder_.data()[index]; }
+
+ void Reset() override { data_builder_.Reset(); }
+
+ Status Resize(int64_t capacity) override {
+ ARROW_RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
+ capacity = std::max(capacity, kMinBuilderCapacity);
+ ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity));
+ return ArrayBuilder::Resize(capacity);
}
/// \brief Append a sequence of elements in one shot
@@ -82,7 +99,13 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder {
/// indicates a valid (non-null) value
/// \return Status
Status AppendValues(const value_type* values, int64_t length,
- const uint8_t* valid_bytes = NULLPTR);
+ const uint8_t* valid_bytes = NULLPTR) {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ data_builder_.UnsafeAppend(values, length);
+ // length_ is update by these
+ ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length);
+ return Status::OK();
+ }
/// \brief Append a sequence of elements in one shot
/// \param[in] values a contiguous C array of values
@@ -91,7 +114,13 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder {
/// (0). Equal in length to values
/// \return Status
Status AppendValues(const value_type* values, int64_t length,
- const std::vector<bool>& is_valid);
+ const std::vector<bool>& is_valid) {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ data_builder_.UnsafeAppend(values, length);
+ // length_ is update by these
+ ArrayBuilder::UnsafeAppendToBitmap(is_valid);
+ return Status::OK();
+ }
/// \brief Append a sequence of elements in one shot
/// \param[in] values a std::vector of values
@@ -99,25 +128,35 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder {
/// (0). Equal in length to values
/// \return Status
Status AppendValues(const std::vector<value_type>& values,
- const std::vector<bool>& is_valid);
+ const std::vector<bool>& is_valid) {
+ return AppendValues(values.data(), static_cast<int64_t>(values.size()),
is_valid);
+ }
/// \brief Append a sequence of elements in one shot
/// \param[in] values a std::vector of values
/// \return Status
- Status AppendValues(const std::vector<value_type>& values);
+ Status AppendValues(const std::vector<value_type>& values) {
+ return AppendValues(values.data(), static_cast<int64_t>(values.size()));
+ }
+
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
+ std::shared_ptr<Buffer> data, null_bitmap;
+ ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+ ARROW_RETURN_NOT_OK(data_builder_.Finish(&data));
+ *out = ArrayData::Make(type_, length_, {null_bitmap, data}, null_count_);
+ capacity_ = length_ = null_count_ = 0;
+ return Status::OK();
+ }
/// \brief Append a sequence of elements in one shot
/// \param[in] values_begin InputIterator to the beginning of the values
/// \param[in] values_end InputIterator pointing to the end of the values
/// \return Status
-
template <typename ValuesIter>
Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
int64_t length = static_cast<int64_t>(std::distance(values_begin,
values_end));
ARROW_RETURN_NOT_OK(Reserve(length));
-
- std::copy(values_begin, values_end, raw_data_ + length_);
-
+ data_builder_.UnsafeAppend(values_begin, values_end);
// this updates the length_
UnsafeSetNotNull(length);
return Status::OK();
@@ -137,14 +176,11 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder
{
"version instead");
int64_t length = static_cast<int64_t>(std::distance(values_begin,
values_end));
ARROW_RETURN_NOT_OK(Reserve(length));
-
- std::copy(values_begin, values_end, raw_data_ + length_);
-
- // this updates the length_
- for (int64_t i = 0; i != length; ++i) {
- UnsafeAppendToBitmap(*valid_begin);
- ++valid_begin;
- }
+ data_builder_.UnsafeAppend(values_begin, values_end);
+ null_bitmap_builder_.UnsafeAppend<true>(
+ length, [&valid_begin]() -> bool { return *valid_begin++; });
+ length_ = null_bitmap_builder_.length();
+ null_count_ = null_bitmap_builder_.false_count();
return Status::OK();
}
@@ -154,71 +190,37 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder
{
ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
int64_t length = static_cast<int64_t>(std::distance(values_begin,
values_end));
ARROW_RETURN_NOT_OK(Reserve(length));
-
- std::copy(values_begin, values_end, raw_data_ + length_);
-
+ data_builder_.UnsafeAppend(values_begin, values_end);
// this updates the length_
if (valid_begin == NULLPTR) {
UnsafeSetNotNull(length);
} else {
- for (int64_t i = 0; i != length; ++i) {
- UnsafeAppendToBitmap(*valid_begin);
- ++valid_begin;
- }
+ null_bitmap_builder_.UnsafeAppend<true>(
+ length, [&valid_begin]() -> bool { return *valid_begin++; });
+ length_ = null_bitmap_builder_.length();
+ null_count_ = null_bitmap_builder_.false_count();
}
return Status::OK();
}
- Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
- void Reset() override;
-
- Status Resize(int64_t capacity) override;
-
- protected:
- std::shared_ptr<ResizableBuffer> data_;
- value_type* raw_data_;
-};
-
-/// Base class for all Builders that emit an Array of a scalar numerical type.
-template <typename T>
-class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder<T> {
- public:
- using typename PrimitiveBuilder<T>::value_type;
- using PrimitiveBuilder<T>::PrimitiveBuilder;
-
- template <typename T1 = T>
- explicit NumericBuilder(
- typename std::enable_if<TypeTraits<T1>::is_parameter_free,
MemoryPool*>::type pool
- ARROW_MEMORY_POOL_DEFAULT)
- : PrimitiveBuilder<T1>(TypeTraits<T1>::type_singleton(), pool) {}
-
- using ArrayBuilder::UnsafeAppendNull;
- using ArrayBuilder::UnsafeAppendToBitmap;
- using PrimitiveBuilder<T>::AppendValues;
- using PrimitiveBuilder<T>::Resize;
- using PrimitiveBuilder<T>::Reserve;
-
- /// Append a single scalar and increase the size if necessary.
- Status Append(const value_type val) {
- ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1));
- UnsafeAppend(val);
- return Status::OK();
- }
-
/// Append a single scalar under the assumption that the underlying Buffer is
/// large enough.
///
/// This method does not capacity-check; make sure to call Reserve
/// beforehand.
void UnsafeAppend(const value_type val) {
- raw_data_[length_] = val;
- UnsafeAppendToBitmap(true);
+ ArrayBuilder::UnsafeAppendToBitmap(true);
+ data_builder_.UnsafeAppend(val);
+ }
+
+ void UnsafeAppendNull() {
+ ArrayBuilder::UnsafeAppendToBitmap(false);
+ data_builder_.UnsafeAppend(0);
}
protected:
- using PrimitiveBuilder<T>::length_;
- using PrimitiveBuilder<T>::raw_data_;
+ TypedBufferBuilder<value_type> data_builder_;
};
// Builders
@@ -249,21 +251,17 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
explicit BooleanBuilder(const std::shared_ptr<DataType>& type, MemoryPool*
pool);
- using ArrayBuilder::Advance;
- using ArrayBuilder::UnsafeAppendNull;
-
/// Write nulls as uint8_t* (0 value indicates null) into pre-allocated
memory
- Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
+ Status AppendNulls(int64_t length) {
ARROW_RETURN_NOT_OK(Reserve(length));
- UnsafeAppendToBitmap(valid_bytes, length);
-
+ data_builder_.UnsafeAppend(length, false);
+ UnsafeSetNull(length);
return Status::OK();
}
Status AppendNull() {
ARROW_RETURN_NOT_OK(Reserve(1));
- UnsafeAppendToBitmap(false);
-
+ UnsafeAppendNull();
return Status::OK();
}
@@ -278,14 +276,15 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
/// Scalar append, without checking for capacity
void UnsafeAppend(const bool val) {
- if (val) {
- BitUtil::SetBit(raw_data_, length_);
- } else {
- BitUtil::ClearBit(raw_data_, length_);
- }
+ data_builder_.UnsafeAppend(val);
UnsafeAppendToBitmap(true);
}
+ void UnsafeAppendNull() {
+ data_builder_.UnsafeAppend(false);
+ UnsafeAppendToBitmap(false);
+ }
+
void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); }
/// \brief Append a sequence of elements in one shot
@@ -340,10 +339,8 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
int64_t length = static_cast<int64_t>(std::distance(values_begin,
values_end));
ARROW_RETURN_NOT_OK(Reserve(length));
- auto iter = values_begin;
- internal::GenerateBitsUnrolled(raw_data_, length_, length,
- [&iter]() -> bool { return *(iter++); });
-
+ data_builder_.UnsafeAppend<false>(
+ length, [&values_begin]() -> bool { return *values_begin++; });
// this updates length_
UnsafeSetNotNull(length);
return Status::OK();
@@ -364,15 +361,12 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
int64_t length = static_cast<int64_t>(std::distance(values_begin,
values_end));
ARROW_RETURN_NOT_OK(Reserve(length));
- auto iter = values_begin;
- internal::GenerateBitsUnrolled(raw_data_, length_, length,
- [&iter]() -> bool { return *(iter++); });
-
- // this updates length_
- for (int64_t i = 0; i != length; ++i) {
- ArrayBuilder::UnsafeAppendToBitmap(*valid_begin);
- ++valid_begin;
- }
+ data_builder_.UnsafeAppend<false>(
+ length, [&values_begin]() -> bool { return *values_begin++; });
+ null_bitmap_builder_.UnsafeAppend<true>(
+ length, [&valid_begin]() -> bool { return *valid_begin++; });
+ length_ = null_bitmap_builder_.length();
+ null_count_ = null_bitmap_builder_.false_count();
return Status::OK();
}
@@ -382,21 +376,17 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
int64_t length = static_cast<int64_t>(std::distance(values_begin,
values_end));
ARROW_RETURN_NOT_OK(Reserve(length));
+ data_builder_.UnsafeAppend<false>(
+ length, [&values_begin]() -> bool { return *values_begin++; });
- auto iter = values_begin;
- internal::GenerateBitsUnrolled(raw_data_, length_, length,
- [&iter]() -> bool { return *(iter++); });
-
- // this updates the length_
if (valid_begin == NULLPTR) {
UnsafeSetNotNull(length);
} else {
- for (int64_t i = 0; i != length; ++i) {
- ArrayBuilder::UnsafeAppendToBitmap(*valid_begin);
- ++valid_begin;
- }
+ null_bitmap_builder_.UnsafeAppend<true>(
+ length, [&valid_begin]() -> bool { return *valid_begin++; });
}
-
+ length_ = null_bitmap_builder_.length();
+ null_count_ = null_bitmap_builder_.false_count();
return Status::OK();
}
@@ -405,8 +395,7 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
Status Resize(int64_t capacity) override;
protected:
- std::shared_ptr<ResizableBuffer> data_;
- uint8_t* raw_data_;
+ TypedBufferBuilder<bool> data_builder_;
};
} // namespace arrow
diff --git a/cpp/src/arrow/buffer-builder.h b/cpp/src/arrow/buffer-builder.h
index b27fbd8..a843021 100644
--- a/cpp/src/arrow/buffer-builder.h
+++ b/cpp/src/arrow/buffer-builder.h
@@ -24,6 +24,7 @@
#include <cstring>
#include <memory>
#include <string>
+#include <utility>
#include "arrow/buffer.h"
#include "arrow/status.h"
@@ -57,7 +58,6 @@ class ARROW_EXPORT BufferBuilder {
return Status::OK();
}
int64_t old_capacity = capacity_;
-
if (buffer_ == NULLPTR) {
ARROW_RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_capacity,
&buffer_));
} else {
@@ -131,6 +131,9 @@ class ARROW_EXPORT BufferBuilder {
// Advance pointer and zero out memory
Status Advance(const int64_t length) { return Append(length, 0); }
+ // Advance pointer, but don't allocate or zero memory
+ void UnsafeAdvance(const int64_t length) { size_ += length; }
+
// Unsafe methods don't check existing size
void UnsafeAppend(const void* data, const int64_t length) {
memcpy(data_ + size_, data, static_cast<size_t>(length));
@@ -153,6 +156,7 @@ class ARROW_EXPORT BufferBuilder {
/// \return Status
Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit));
+ if (size_ != 0) buffer_->ZeroPadding();
*out = buffer_;
Reset();
return Status::OK();
@@ -211,6 +215,14 @@ class TypedBufferBuilder<T, typename
std::enable_if<std::is_arithmetic<T>::value
num_elements * sizeof(T));
}
+ template <typename Iter>
+ void UnsafeAppend(Iter values_begin, Iter values_end) {
+ int64_t num_elements = static_cast<int64_t>(std::distance(values_begin,
values_end));
+ auto data = mutable_data() + length();
+ bytes_builder_.UnsafeAdvance(num_elements * sizeof(T));
+ std::copy(values_begin, values_end, data);
+ }
+
void UnsafeAppend(const int64_t num_copies, T value) {
auto data = mutable_data() + length();
bytes_builder_.UnsafeAppend(num_copies * sizeof(T), 0);
@@ -284,7 +296,7 @@ class TypedBufferBuilder<bool> {
int64_t i = 0;
internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements,
[&] {
bool value = bytes[i++];
- if (!value) ++false_count_;
+ false_count_ += !value;
return value;
});
bit_length_ += num_elements;
@@ -292,12 +304,27 @@ class TypedBufferBuilder<bool> {
void UnsafeAppend(const int64_t num_copies, bool value) {
BitUtil::SetBitsTo(mutable_data(), bit_length_, num_copies, value);
- if (!value) {
- false_count_ += num_copies;
- }
+ false_count_ += num_copies * !value;
bit_length_ += num_copies;
}
+ template <bool count_falses, typename Generator>
+ void UnsafeAppend(const int64_t num_elements, Generator&& gen) {
+ if (num_elements == 0) return;
+
+ if (count_falses) {
+ internal::GenerateBitsUnrolled(mutable_data(), bit_length_,
num_elements, [&] {
+ bool value = gen();
+ false_count_ += !value;
+ return value;
+ });
+ } else {
+ internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements,
+ std::forward<Generator>(gen));
+ }
+ bit_length_ += num_elements;
+ }
+
Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
const int64_t old_byte_capacity = bytes_builder_.capacity();
const int64_t new_byte_capacity = BitUtil::BytesForBits(new_capacity);
@@ -320,6 +347,9 @@ class TypedBufferBuilder<bool> {
}
Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
+ // set bytes_builder_.size_ == byte size of data
+ bytes_builder_.UnsafeAdvance(BitUtil::BytesForBits(bit_length_) -
+ bytes_builder_.length());
bit_length_ = false_count_ = 0;
return bytes_builder_.Finish(out, shrink_to_fit);
}
diff --git a/cpp/src/arrow/buffer-test.cc b/cpp/src/arrow/buffer-test.cc
index 9ae96e7..e503b28 100644
--- a/cpp/src/arrow/buffer-test.cc
+++ b/cpp/src/arrow/buffer-test.cc
@@ -350,6 +350,8 @@ TEST(TestBufferBuilder, BasicBoolBufferBuilderUsage) {
for (int i = 0; i != nvalues; ++i) {
ASSERT_EQ(BitUtil::GetBit(built->data(), i + 1),
static_cast<bool>(values[i]));
}
+
+ ASSERT_EQ(built->size(), BitUtil::BytesForBits(nvalues + 1));
}
TEST(TestBufferBuilder, BoolBufferBuilderAppendCopies) {
@@ -367,6 +369,8 @@ TEST(TestBufferBuilder, BoolBufferBuilderAppendCopies) {
for (int i = 0; i != 13 + 17; ++i) {
EXPECT_EQ(BitUtil::GetBit(built->data(), i), i < 13) << "index = " << i;
}
+
+ ASSERT_EQ(built->size(), BitUtil::BytesForBits(13 + 17));
}
template <typename T>