This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new c99d318b33 GH-37014: [C++][Parquet] Preserve some Parquet distinct 
counts when merging stats (#37016)
c99d318b33 is described below

commit c99d318b3306f704b1fa39bdffe15d62e4c5d43b
Author: Elliott Brossard <[email protected]>
AuthorDate: Thu Aug 31 06:01:03 2023 -0700

    GH-37014: [C++][Parquet] Preserve some Parquet distinct counts when merging 
stats (#37016)
    
    ### Rationale for this change
    
    Statistics merging is unnecessarily conservative with respect to preserving 
distinct counts. If either side is 0, the merge stats can preserve the distinct 
count, instead of clearing it.
    
    ### What changes are included in this PR?
    
    Distinct count is preserved if either the lhs or rhs has a distinct count 
of 0 when merging stats.
    
    ### Are these changes tested?
    
    Yes, added a unit test.
    
    ### Are there any user-facing changes?
    
    I don't think this change to stats is considered to be user-facing.
    * Closes: #37014
    
    Lead-authored-by: Elliott Brossard <[email protected]>
    Co-authored-by: Elliott Brossard 
<[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/statistics.cc      | 15 ++++++--
 cpp/src/parquet/statistics_test.cc | 76 ++++++++++++++++++++++++--------------
 2 files changed, 60 insertions(+), 31 deletions(-)

diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc
index a10fc2a4cf..d5fc71b395 100644
--- a/cpp/src/parquet/statistics.cc
+++ b/cpp/src/parquet/statistics.cc
@@ -562,8 +562,15 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
     } else {
       this->has_null_count_ = false;
     }
-    // Clear has_distinct_count_ as distinct count cannot be merged.
-    has_distinct_count_ = false;
+    if (has_distinct_count_ && other.HasDistinctCount() &&
+        (distinct_count() == 0 || other.distinct_count() == 0)) {
+      // We can merge distinct counts if either side is zero.
+      statistics_.distinct_count =
+          std::max(statistics_.distinct_count, other.distinct_count());
+    } else {
+      // Otherwise clear has_distinct_count_ as distinct count cannot be 
merged.
+      this->has_distinct_count_ = false;
+    }
     // Do not clear min/max here if the other side does not provide
     // min/max which may happen when other is an empty stats or all
     // its values are null and/or NaN.
@@ -621,7 +628,9 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
       // num_values_ is reliable and it means number of non-null values.
       s.all_null_value = num_values_ == 0;
     }
-    // TODO (GH-36505): distinct count is not encoded for now.
+    if (HasDistinctCount()) {
+      s.set_distinct_count(this->distinct_count());
+    }
     return s;
   }
 
diff --git a/cpp/src/parquet/statistics_test.cc 
b/cpp/src/parquet/statistics_test.cc
index 76667f0029..637832945e 100644
--- a/cpp/src/parquet/statistics_test.cc
+++ b/cpp/src/parquet/statistics_test.cc
@@ -601,6 +601,39 @@ class TestStatisticsHasFlag : public 
TestStatistics<TestType> {
     this->SetUpSchema(Repetition::OPTIONAL);
   }
 
+  std::optional<int64_t> MergeDistinctCount(
+      std::optional<int64_t> initial,
+      const std::vector<std::optional<int64_t>>& subsequent) {
+    EncodedStatistics encoded_statistics;
+    if (initial) {
+      encoded_statistics.has_distinct_count = true;
+      encoded_statistics.distinct_count = *initial;
+    }
+    std::shared_ptr<TypedStatistics<TestType>> statistics =
+        std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+            Statistics::Make(this->schema_.Column(0), &encoded_statistics,
+                             /*num_values=*/1000));
+    for (const auto& distinct_count : subsequent) {
+      EncodedStatistics next_encoded_statistics;
+      if (distinct_count) {
+        next_encoded_statistics.has_distinct_count = true;
+        next_encoded_statistics.distinct_count = *distinct_count;
+      }
+      std::shared_ptr<TypedStatistics<TestType>> next_statistics =
+          std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+              Statistics::Make(this->schema_.Column(0), 
&next_encoded_statistics,
+                               /*num_values=*/1000));
+      statistics->Merge(*next_statistics);
+    }
+    EncodedStatistics final_statistics = statistics->Encode();
+    EXPECT_EQ(statistics->HasDistinctCount(), 
final_statistics.has_distinct_count);
+    if (statistics->HasDistinctCount()) {
+      EXPECT_EQ(statistics->distinct_count(), final_statistics.distinct_count);
+      return statistics->distinct_count();
+    }
+    return std::nullopt;
+  }
+
   std::shared_ptr<TypedStatistics<TestType>> MergedStatistics(
       const TypedStatistics<TestType>& stats1, const 
TypedStatistics<TestType>& stats2) {
     auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
@@ -616,35 +649,22 @@ class TestStatisticsHasFlag : public 
TestStatistics<TestType> {
     ASSERT_NO_FATAL_FAILURE(test_fn(MergedStatistics(stats2, stats1).get()));
   }
 
-  // Distinct count should set to false when Merge is called.
+  // Distinct count should set to false when Merge is called, unless one of 
the statistics
+  // has a zero count.
   void TestMergeDistinctCount() {
-    // Create a statistics object with distinct count.
-    std::shared_ptr<TypedStatistics<TestType>> statistics1;
-    {
-      EncodedStatistics encoded_statistics1;
-      statistics1 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
-          Statistics::Make(this->schema_.Column(0), &encoded_statistics1,
-                           /*num_values=*/1000));
-      EXPECT_FALSE(statistics1->HasDistinctCount());
-    }
-
-    // Create a statistics object with distinct count.
-    std::shared_ptr<TypedStatistics<TestType>> statistics2;
-    {
-      EncodedStatistics encoded_statistics2;
-      encoded_statistics2.has_distinct_count = true;
-      encoded_statistics2.distinct_count = 500;
-      statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
-          Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
-                           /*num_values=*/1000));
-      EXPECT_TRUE(statistics2->HasDistinctCount());
-    }
-
-    VerifyMergedStatistics(*statistics1, *statistics2,
-                           [](TypedStatistics<TestType>* merged_statistics) {
-                             
EXPECT_FALSE(merged_statistics->HasDistinctCount());
-                             
EXPECT_FALSE(merged_statistics->Encode().has_distinct_count);
-                           });
+    // Sanity tests.
+    ASSERT_EQ(std::nullopt, MergeDistinctCount(std::nullopt, {}));
+    ASSERT_EQ(10, MergeDistinctCount(10, {}));
+
+    ASSERT_EQ(std::nullopt, MergeDistinctCount(std::nullopt, {0}));
+    ASSERT_EQ(std::nullopt, MergeDistinctCount(std::nullopt, {10, 0}));
+    ASSERT_EQ(10, MergeDistinctCount(10, {0, 0}));
+    ASSERT_EQ(10, MergeDistinctCount(0, {10, 0}));
+    ASSERT_EQ(10, MergeDistinctCount(0, {0, 10}));
+    ASSERT_EQ(10, MergeDistinctCount(0, {0, 10, 0}));
+    ASSERT_EQ(std::nullopt, MergeDistinctCount(10, {0, 10}));
+    ASSERT_EQ(std::nullopt, MergeDistinctCount(10, {0, std::nullopt}));
+    ASSERT_EQ(std::nullopt, MergeDistinctCount(0, {std::nullopt, 0}));
   }
 
   // If all values in a page are null or nan, its stats should not set min-max.

Reply via email to