This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 3df5ba8bc7 GH-34561: [C++] Implement RunEndEncodedBuilder::AppendEmptyValues() (#34562)
3df5ba8bc7 is described below
commit 3df5ba8bc72e371cdf4f7ab95e6e7191716c61ab
Author: Felipe Oliveira Carvalho <[email protected]>
AuthorDate: Wed Mar 15 11:48:03 2023 -0300
GH-34561: [C++] Implement RunEndEncodedBuilder::AppendEmptyValues() (#34562)
* Closes: #34561
Authored-by: Felipe Oliveira Carvalho <[email protected]>
Signed-off-by: Matt Topol <[email protected]>
---
cpp/src/arrow/array/array_run_end_test.cc | 28 +++++++++++++++++++++++-----
cpp/src/arrow/array/builder_run_end.cc | 24 ++++++++++++++++++++----
cpp/src/arrow/array/builder_run_end.h | 23 ++++++++++++++++-------
3 files changed, 59 insertions(+), 16 deletions(-)
diff --git a/cpp/src/arrow/array/array_run_end_test.cc b/cpp/src/arrow/array/array_run_end_test.cc
index 377a127117..f56edf68d5 100644
--- a/cpp/src/arrow/array/array_run_end_test.cc
+++ b/cpp/src/arrow/array/array_run_end_test.cc
@@ -301,22 +301,40 @@ TEST_P(TestRunEndEncodedArray, Builder) {
R"(["unique", null, "common", "common", "appended", "common", "common", "appended"])"));
continue;
}
- if (step == 10) {
- ASSERT_EQ(builder->length(), 505);
+ // Append empty values
+ ASSERT_OK(builder->AppendEmptyValues(10));
+ if (step == 11) {
+ ASSERT_EQ(builder->length(), 515);
+ ASSERT_OK(BuilderEquals(
+ *builder, 515, "[1, 3, 105, 165, 205, 305, 405, 505, 515]",
+ R"(["unique", null, "common", "common", "appended", "common", "common", "appended", ""])"));
+ continue;
+ }
+ // Append NULL after empty
+ ASSERT_OK(builder->AppendNull());
+ if (step == 12) {
+ ASSERT_EQ(builder->length(), 516);
+ ASSERT_OK(BuilderEquals(
+ *builder, 516, "[1, 3, 105, 165, 205, 305, 405, 505, 515, 516]",
+ R"(["unique", null, "common", "common", "appended", "common", "common", "appended", "", null])"));
+ continue;
+ }
+ if (step == 13) {
+ ASSERT_EQ(builder->length(), 516);
ASSERT_EQ(*builder->type(), *run_end_encoded(run_end_type, utf8()));
auto expected_run_ends =
- ArrayFromJSON(run_end_type, "[1, 3, 105, 165, 205, 305, 405, 505]");
+ ArrayFromJSON(run_end_type, "[1, 3, 105, 165, 205, 305, 405, 505, 515, 516]");
auto expected_values = ArrayFromJSON(
value_type,
- R"(["unique", null, "common", "common", "appended", "common", "common", "appended"])");
+ R"(["unique", null, "common", "common", "appended", "common", "common", "appended", "", null])");
ASSERT_OK_AND_ASSIGN(auto array, builder->Finish());
auto ree_array = std::dynamic_pointer_cast<RunEndEncodedArray>(array);
ASSERT_NE(ree_array, NULLPTR);
ASSERT_ARRAYS_EQUAL(*expected_run_ends, *ree_array->run_ends());
ASSERT_ARRAYS_EQUAL(*expected_values, *ree_array->values());
- ASSERT_EQ(array->length(), 505);
+ ASSERT_EQ(array->length(), 516);
ASSERT_EQ(array->offset(), 0);
break;
}
diff --git a/cpp/src/arrow/array/builder_run_end.cc b/cpp/src/arrow/array/builder_run_end.cc
index 31e331c6c8..5908809292 100644
--- a/cpp/src/arrow/array/builder_run_end.cc
+++ b/cpp/src/arrow/array/builder_run_end.cc
@@ -76,7 +76,21 @@ Status RunCompressorBuilder::AppendNulls(int64_t length) {
}
Status RunCompressorBuilder::AppendEmptyValues(int64_t length) {
- return Status::NotImplemented("Append empty values to a run-compressed array.");
+ if (ARROW_PREDICT_FALSE(length == 0)) {
+ return Status::OK();
+ }
+ // Empty values are usually appended as placeholders for future values, so
+ // we make no attempt at making the empty values appended now part of the
+ // current run. Each AppendEmptyValues() creates its own run of the given length.
+ ARROW_RETURN_NOT_OK(FinishCurrentRun());
+ {
+ ARROW_RETURN_NOT_OK(WillCloseRunOfEmptyValues(length));
+ ARROW_RETURN_NOT_OK(inner_builder_->AppendEmptyValue());
+ UpdateDimensions();
+ }
+ // Current run remains cleared after FinishCurrentRun() as we don't want to
+ // extend it with empty values potentially coming in the future.
+ return Status::OK();
}
Status RunCompressorBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) {
@@ -183,7 +197,10 @@ Status RunEndEncodedBuilder::AppendNulls(int64_t length) {
}
Status RunEndEncodedBuilder::AppendEmptyValues(int64_t length) {
- return Status::NotImplemented("Append empty values to run-end encoded array.");
+ RETURN_NOT_OK(value_run_builder_->AppendEmptyValues(length));
+ DCHECK_EQ(value_run_builder_->open_run_length(), 0);
+ UpdateDimensions(committed_logical_length_, 0);
+ return Status::OK();
}
Status RunEndEncodedBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) {
@@ -313,8 +330,7 @@ Status RunEndEncodedBuilder::AppendRunEnd(int64_t run_end) {
return Status::OK();
}
-Status RunEndEncodedBuilder::CloseRun(const std::shared_ptr<const Scalar>& value,
- int64_t run_length) {
+Status RunEndEncodedBuilder::CloseRun(int64_t run_length) {
// TODO(felipecrv): gracefully fragment runs bigger than INT32_MAX
if (ARROW_PREDICT_FALSE(run_length > std::numeric_limits<int32_t>::max())) {
return Status::Invalid(
diff --git a/cpp/src/arrow/array/builder_run_end.h b/cpp/src/arrow/array/builder_run_end.h
index cd82ca4a70..9764c57992 100644
--- a/cpp/src/arrow/array/builder_run_end.h
+++ b/cpp/src/arrow/array/builder_run_end.h
@@ -79,6 +79,15 @@ class RunCompressorBuilder : public ArrayBuilder {
return Status::OK();
}
+ /// \brief Called right before a run of empty values is being closed
+ ///
+ /// Subclasses can override this function to perform an additional action when
+ /// a run of empty values is appended (i.e. run-length is known and a single
+ /// empty value is appended to the inner builder).
+ ///
+ /// \param length the greater than 0 length of the value run being closed
+ virtual Status WillCloseRunOfEmptyValues(int64_t length) { return Status::OK(); }
+
/// \brief Allocate enough memory for a given number of array elements.
///
/// NOTE: Conservatively resizing a run-length compressed array for a given
@@ -103,8 +112,6 @@ class RunCompressorBuilder : public ArrayBuilder {
Status AppendNull() final { return AppendNulls(1); }
Status AppendNulls(int64_t length) override;
- // These two fail with Status::NotImplemented as it is impossible to compress
- // unknown placeholder values.
Status AppendEmptyValue() final { return AppendEmptyValues(1); }
Status AppendEmptyValues(int64_t length) override;
@@ -179,9 +186,12 @@ class ARROW_EXPORT RunEndEncodedBuilder : public ArrayBuilder {
~ValueRunBuilder() override = default;
- Status WillCloseRun(const std::shared_ptr<const Scalar>& value,
- int64_t length) override {
- return ree_builder_.CloseRun(value, length);
+ Status WillCloseRun(const std::shared_ptr<const Scalar>&, int64_t length) override {
+ return ree_builder_.CloseRun(length);
+ }
+
+ Status WillCloseRunOfEmptyValues(int64_t length) override {
+ return ree_builder_.CloseRun(length);
}
private:
@@ -276,8 +286,7 @@ class ARROW_EXPORT RunEndEncodedBuilder : public ArrayBuilder {
/// length_ to reflect the new run.
///
/// Pre-condition: run_length > 0.
- [[nodiscard]] Status CloseRun(const std::shared_ptr<const Scalar>& value,
- int64_t run_length);
+ [[nodiscard]] Status CloseRun(int64_t run_length);
ArrayBuilder& run_end_builder();
ArrayBuilder& value_builder();