This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 3df5ba8bc7 GH-34561: [C++] Implement RunEndEncodedBuilder::AppendEmptyValues() (#34562)
3df5ba8bc7 is described below

commit 3df5ba8bc72e371cdf4f7ab95e6e7191716c61ab
Author: Felipe Oliveira Carvalho <[email protected]>
AuthorDate: Wed Mar 15 11:48:03 2023 -0300

    GH-34561: [C++] Implement RunEndEncodedBuilder::AppendEmptyValues() (#34562)
    
    
    * Closes: #34561
    
    Authored-by: Felipe Oliveira Carvalho <[email protected]>
    Signed-off-by: Matt Topol <[email protected]>
---
 cpp/src/arrow/array/array_run_end_test.cc | 28 +++++++++++++++++++++++-----
 cpp/src/arrow/array/builder_run_end.cc    | 24 ++++++++++++++++++++----
 cpp/src/arrow/array/builder_run_end.h     | 23 ++++++++++++++++-------
 3 files changed, 59 insertions(+), 16 deletions(-)

diff --git a/cpp/src/arrow/array/array_run_end_test.cc b/cpp/src/arrow/array/array_run_end_test.cc
index 377a127117..f56edf68d5 100644
--- a/cpp/src/arrow/array/array_run_end_test.cc
+++ b/cpp/src/arrow/array/array_run_end_test.cc
@@ -301,22 +301,40 @@ TEST_P(TestRunEndEncodedArray, Builder) {
           R"(["unique", null, "common", "common", "appended", "common", "common", "appended"])"));
       continue;
     }
-    if (step == 10) {
-      ASSERT_EQ(builder->length(), 505);
+    // Append empty values
+    ASSERT_OK(builder->AppendEmptyValues(10));
+    if (step == 11) {
+      ASSERT_EQ(builder->length(), 515);
+      ASSERT_OK(BuilderEquals(
+          *builder, 515, "[1, 3, 105, 165, 205, 305, 405, 505, 515]",
+          R"(["unique", null, "common", "common", "appended", "common", "common", "appended", ""])"));
+      continue;
+    }
+    // Append NULL after empty
+    ASSERT_OK(builder->AppendNull());
+    if (step == 12) {
+      ASSERT_EQ(builder->length(), 516);
+      ASSERT_OK(BuilderEquals(
+          *builder, 516, "[1, 3, 105, 165, 205, 305, 405, 505, 515, 516]",
+          R"(["unique", null, "common", "common", "appended", "common", "common", "appended", "", null])"));
+      continue;
+    }
+    if (step == 13) {
+      ASSERT_EQ(builder->length(), 516);
       ASSERT_EQ(*builder->type(), *run_end_encoded(run_end_type, utf8()));
 
       auto expected_run_ends =
-          ArrayFromJSON(run_end_type, "[1, 3, 105, 165, 205, 305, 405, 505]");
+          ArrayFromJSON(run_end_type, "[1, 3, 105, 165, 205, 305, 405, 505, 515, 516]");
       auto expected_values = ArrayFromJSON(
           value_type,
-          R"(["unique", null, "common", "common", "appended", "common", "common", "appended"])");
+          R"(["unique", null, "common", "common", "appended", "common", "common", "appended", "", null])");
 
       ASSERT_OK_AND_ASSIGN(auto array, builder->Finish());
       auto ree_array = std::dynamic_pointer_cast<RunEndEncodedArray>(array);
       ASSERT_NE(ree_array, NULLPTR);
       ASSERT_ARRAYS_EQUAL(*expected_run_ends, *ree_array->run_ends());
       ASSERT_ARRAYS_EQUAL(*expected_values, *ree_array->values());
-      ASSERT_EQ(array->length(), 505);
+      ASSERT_EQ(array->length(), 516);
       ASSERT_EQ(array->offset(), 0);
       break;
     }
diff --git a/cpp/src/arrow/array/builder_run_end.cc b/cpp/src/arrow/array/builder_run_end.cc
index 31e331c6c8..5908809292 100644
--- a/cpp/src/arrow/array/builder_run_end.cc
+++ b/cpp/src/arrow/array/builder_run_end.cc
@@ -76,7 +76,21 @@ Status RunCompressorBuilder::AppendNulls(int64_t length) {
 }
 
 Status RunCompressorBuilder::AppendEmptyValues(int64_t length) {
-  return Status::NotImplemented("Append empty values to a run-compressed array.");
+  if (ARROW_PREDICT_FALSE(length == 0)) {
+    return Status::OK();
+  }
+  // Empty values are usually appended as placeholders for future values, so
+  // we make no attempt at making the empty values appended now part of the
+  // current run. Each AppendEmptyValues() creates its own run of the given length.
+  ARROW_RETURN_NOT_OK(FinishCurrentRun());
+  {
+    ARROW_RETURN_NOT_OK(WillCloseRunOfEmptyValues(length));
+    ARROW_RETURN_NOT_OK(inner_builder_->AppendEmptyValue());
+    UpdateDimensions();
+  }
+  // Current run remains cleared after FinishCurrentRun() as we don't want to
+  // extend it with empty values potentially coming in the future.
+  return Status::OK();
 }
 
 Status RunCompressorBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) {
@@ -183,7 +197,10 @@ Status RunEndEncodedBuilder::AppendNulls(int64_t length) {
 }
 
 Status RunEndEncodedBuilder::AppendEmptyValues(int64_t length) {
-  return Status::NotImplemented("Append empty values to run-end encoded array.");
+  RETURN_NOT_OK(value_run_builder_->AppendEmptyValues(length));
+  DCHECK_EQ(value_run_builder_->open_run_length(), 0);
+  UpdateDimensions(committed_logical_length_, 0);
+  return Status::OK();
 }
 
 Status RunEndEncodedBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) {
@@ -313,8 +330,7 @@ Status RunEndEncodedBuilder::AppendRunEnd(int64_t run_end) {
   return Status::OK();
 }
 
-Status RunEndEncodedBuilder::CloseRun(const std::shared_ptr<const Scalar>& value,
-                                      int64_t run_length) {
+Status RunEndEncodedBuilder::CloseRun(int64_t run_length) {
   // TODO(felipecrv): gracefully fragment runs bigger than INT32_MAX
   if (ARROW_PREDICT_FALSE(run_length > std::numeric_limits<int32_t>::max())) {
     return Status::Invalid(
diff --git a/cpp/src/arrow/array/builder_run_end.h b/cpp/src/arrow/array/builder_run_end.h
index cd82ca4a70..9764c57992 100644
--- a/cpp/src/arrow/array/builder_run_end.h
+++ b/cpp/src/arrow/array/builder_run_end.h
@@ -79,6 +79,15 @@ class RunCompressorBuilder : public ArrayBuilder {
     return Status::OK();
   }
 
+  /// \brief Called right before a run of empty values is being closed
+  ///
+  /// Subclasses can override this function to perform an additional action when
+  /// a run of empty values is appended (i.e. run-length is known and a single
+  /// empty value is appended to the inner builder).
+  ///
+  /// \param length the greater than 0 length of the value run being closed
+  virtual Status WillCloseRunOfEmptyValues(int64_t length) { return Status::OK(); }
+
   /// \brief Allocate enough memory for a given number of array elements.
   ///
   /// NOTE: Conservatively resizing a run-length compressed array for a given
@@ -103,8 +112,6 @@ class RunCompressorBuilder : public ArrayBuilder {
   Status AppendNull() final { return AppendNulls(1); }
   Status AppendNulls(int64_t length) override;
 
-  // These two fail with Status::NotImplemented as it is impossible to compress
-  // unknown placeholder values.
   Status AppendEmptyValue() final { return AppendEmptyValues(1); }
   Status AppendEmptyValues(int64_t length) override;
 
@@ -179,9 +186,12 @@ class ARROW_EXPORT RunEndEncodedBuilder : public ArrayBuilder {
 
     ~ValueRunBuilder() override = default;
 
-    Status WillCloseRun(const std::shared_ptr<const Scalar>& value,
-                        int64_t length) override {
-      return ree_builder_.CloseRun(value, length);
+    Status WillCloseRun(const std::shared_ptr<const Scalar>&, int64_t length) override {
+      return ree_builder_.CloseRun(length);
+    }
+
+    Status WillCloseRunOfEmptyValues(int64_t length) override {
+      return ree_builder_.CloseRun(length);
     }
 
    private:
@@ -276,8 +286,7 @@ class ARROW_EXPORT RunEndEncodedBuilder : public ArrayBuilder {
   /// length_ to reflect the new run.
   ///
   /// Pre-condition: run_length > 0.
-  [[nodiscard]] Status CloseRun(const std::shared_ptr<const Scalar>& value,
-                                int64_t run_length);
+  [[nodiscard]] Status CloseRun(int64_t run_length);
 
   ArrayBuilder& run_end_builder();
   ArrayBuilder& value_builder();

Reply via email to