[
https://issues.apache.org/jira/browse/ARROW-1712?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16336822#comment-16336822
]
ASF GitHub Bot commented on ARROW-1712:
---------------------------------------
wesm closed pull request #1481: ARROW-1712: [C++] Add method to BinaryBuilder
to reserve space for value data
URL: https://github.com/apache/arrow/pull/1481
This PR was merged from a forked repository. Because GitHub hides the
original diff of a foreign (fork) pull request once it is merged, the diff
is reproduced below for the sake of provenance:
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 7ff3261ec..c53da8591 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -1155,6 +1155,45 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) {
}
}
+TEST_F(TestBinaryBuilder, TestCapacityReserve) {
+ vector<string> strings = {"aaaaa", "bbbbbbbbbb", "ccccccccccccccc",
"dddddddddd"};
+ int N = static_cast<int>(strings.size());
+ int reps = 15;
+ int64_t length = 0;
+ int64_t capacity = 1000;
+ int64_t expected_capacity = BitUtil::RoundUpToMultipleOf64(capacity);
+
+ ASSERT_OK(builder_->ReserveData(capacity));
+
+ ASSERT_EQ(length, builder_->value_data_length());
+ ASSERT_EQ(expected_capacity, builder_->value_data_capacity());
+
+ for (int j = 0; j < reps; ++j) {
+ for (int i = 0; i < N; ++i) {
+ ASSERT_OK(builder_->Append(strings[i]));
+ length += static_cast<int>(strings[i].size());
+
+ ASSERT_EQ(length, builder_->value_data_length());
+ ASSERT_EQ(expected_capacity, builder_->value_data_capacity());
+ }
+ }
+
+ int extra_capacity = 500;
+ expected_capacity = BitUtil::RoundUpToMultipleOf64(length + extra_capacity);
+
+ ASSERT_OK(builder_->ReserveData(extra_capacity));
+
+ ASSERT_EQ(length, builder_->value_data_length());
+ ASSERT_EQ(expected_capacity, builder_->value_data_capacity());
+
+ Done();
+
+ ASSERT_EQ(reps * N, result_->length());
+ ASSERT_EQ(0, result_->null_count());
+ ASSERT_EQ(reps * 40, result_->value_data()->size());
+ ASSERT_EQ(expected_capacity, result_->value_data()->capacity());
+}
+
TEST_F(TestBinaryBuilder, TestZeroLength) {
// All buffers are null
Done();
diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h
index b50b1a1aa..44c352a93 100644
--- a/cpp/src/arrow/buffer.h
+++ b/cpp/src/arrow/buffer.h
@@ -333,6 +333,7 @@ class ARROW_EXPORT TypedBufferBuilder : public
BufferBuilder {
const T* data() const { return reinterpret_cast<const T*>(data_); }
int64_t length() const { return size_ / sizeof(T); }
+ int64_t capacity() const { return capacity_ / sizeof(T); }
};
/// \brief Allocate a fixed size mutable buffer from a memory pool
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index de132b5f6..db901526f 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -1165,13 +1165,13 @@ Status ListBuilder::Init(int64_t elements) {
DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
RETURN_NOT_OK(ArrayBuilder::Init(elements));
// one more then requested for offsets
- return offsets_builder_.Resize((elements + 1) * sizeof(int64_t));
+ return offsets_builder_.Resize((elements + 1) * sizeof(int32_t));
}
Status ListBuilder::Resize(int64_t capacity) {
DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
// one more then requested for offsets
- RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t)));
+ RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
return ArrayBuilder::Resize(capacity);
}
@@ -1216,16 +1216,26 @@ Status BinaryBuilder::Init(int64_t elements) {
DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
RETURN_NOT_OK(ArrayBuilder::Init(elements));
// one more then requested for offsets
- return offsets_builder_.Resize((elements + 1) * sizeof(int64_t));
+ return offsets_builder_.Resize((elements + 1) * sizeof(int32_t));
}
Status BinaryBuilder::Resize(int64_t capacity) {
DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
// one more then requested for offsets
- RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t)));
+ RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
return ArrayBuilder::Resize(capacity);
}
+Status BinaryBuilder::ReserveData(int64_t elements) {
+ if (value_data_length() + elements > value_data_capacity()) {
+ if (value_data_length() + elements > std::numeric_limits<int32_t>::max()) {
+ return Status::Invalid("Cannot reserve capacity larger than 2^31 - 1 for
binary");
+ }
+ RETURN_NOT_OK(value_data_builder_.Reserve(elements));
+ }
+ return Status::OK();
+}
+
Status BinaryBuilder::AppendNextOffset() {
const int64_t num_bytes = value_data_builder_.length();
if (ARROW_PREDICT_FALSE(num_bytes > kMaximumCapacity)) {
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index ce7b8cd19..d1611f60c 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -682,10 +682,15 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
Status Init(int64_t elements) override;
Status Resize(int64_t capacity) override;
+ /// \brief Ensures there is enough allocated capacity to append the indicated
+ /// number of bytes to the value data buffer without additional allocations
+ Status ReserveData(int64_t elements);
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \return size of values buffer so far
int64_t value_data_length() const { return value_data_builder_.length(); }
+ /// \return capacity of values buffer
+ int64_t value_data_capacity() const { return value_data_builder_.capacity();
}
/// Temporary access to a value.
///
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
> [C++] Add method to BinaryBuilder to reserve space for value data
> -----------------------------------------------------------------
>
> Key: ARROW-1712
> URL: https://issues.apache.org/jira/browse/ARROW-1712
> Project: Apache Arrow
> Issue Type: Improvement
> Components: C++
> Reporter: Wes McKinney
> Assignee: Panchen Xue
> Priority: Major
> Labels: pull-request-available
> Fix For: 0.9.0
>
>
> The {{Resize}} and {{Reserve}} methods only reserve space for the value
> offsets. When building binary/string arrays with a known size (or some
> reasonable estimate), it would be more efficient to reserve space for the
> value data once at the beginning to prevent internal reallocations.
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)