wgtmac commented on code in PR #35989:
URL: https://github.com/apache/arrow/pull/35989#discussion_r1252560357


##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -590,6 +593,173 @@ TYPED_TEST(TestNumericStatistics, Equals) {
   ASSERT_NO_FATAL_FAILURE(this->TestEquals());
 }
 
+template <typename TestType>
+class TestStatisticsHasFlag : public TestStatistics<TestType> {
+ public:
+  void SetUp() override {
+    TestStatistics<TestType>::SetUp();
+    this->SetUpSchema(Repetition::OPTIONAL);
+  }
+
+  std::shared_ptr<TypedStatistics<TestType>> BuildMergedStatistics(
+      const TypedStatistics<TestType>& stats1, const 
TypedStatistics<TestType>& stats2) {
+    auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
+    chunk_statistics->Merge(stats1);
+    chunk_statistics->Merge(stats2);
+    return chunk_statistics;
+  }
+
+  void VerifyMergedStatistics(const TypedStatistics<TestType>& stats1,
+                              const TypedStatistics<TestType>& stats2,
+                              const 
std::function<void(TypedStatistics<TestType>*)>& fn) {
+    fn(BuildMergedStatistics(stats1, stats2).get());
+    fn(BuildMergedStatistics(stats2, stats1).get());
+  }
+
+  // Distinct count should set to false when Merge is called.
+  void TestMergeDistinctCount() {
+    // Create a statistics object with distinct count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      EncodedStatistics encoded_statistics1;
+      statistics1 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics1,
+                           /*num_values=*/1000));
+      EXPECT_FALSE(statistics1->HasDistinctCount());
+    }
+
+    // Create a statistics object with distinct count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      EncodedStatistics encoded_statistics2;
+      encoded_statistics2.has_distinct_count = true;
+      encoded_statistics2.distinct_count = 500;
+      statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
+                           /*num_values=*/1000));
+      EXPECT_TRUE(statistics2->HasDistinctCount());
+    }
+
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             
EXPECT_FALSE(merged_statistics->HasDistinctCount());
+                             
EXPECT_FALSE(merged_statistics->Encode().has_distinct_count);
+                           });
+  }
+
+  // If all values in a page are null or nan, its stats should not set min-max.
+  // Merging its stats with another page having good min-max stats should not
+  // drop the valid min-max from the latter page.
+  void TestMergeMinMax() {
+    this->GenerateData(1000);
+    // Create a statistics object without min-max.
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics1->Update(this->values_ptr_, /*num_values=*/0,
+                          /*null_count*/ this->values_.size());
+      auto encoded_stats1 = statistics1->Encode();
+      EXPECT_FALSE(statistics1->HasMinMax());
+      EXPECT_FALSE(encoded_stats1.has_min);
+      EXPECT_FALSE(encoded_stats1.has_max);
+    }
+    // Create a statistics object with min-max.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      statistics2 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics2->Update(this->values_ptr_, this->values_.size(), 0);
+      auto encoded_stats2 = statistics2->Encode();
+      EXPECT_TRUE(statistics2->HasMinMax());
+      EXPECT_TRUE(encoded_stats2.has_min);
+      EXPECT_TRUE(encoded_stats2.has_max);
+    }
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             EXPECT_TRUE(merged_statistics->HasMinMax());
+                             EXPECT_TRUE(merged_statistics->Encode().has_min);
+                             EXPECT_TRUE(merged_statistics->Encode().has_max);
+                           });
+  }
+
+  // Default statistics should have null_count even if no nulls is written.
+  // However, if statistics read from thrift, it might not have null_count,
+  // and page merge with it should also not have null_count.
+  void TestMergeNullCount() {
+    this->GenerateData(1000);

Review Comment:
   ```suggestion
       this->GenerateData(/*num_values=*/1000);
   ```



##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -590,6 +593,173 @@ TYPED_TEST(TestNumericStatistics, Equals) {
   ASSERT_NO_FATAL_FAILURE(this->TestEquals());
 }
 
+template <typename TestType>
+class TestStatisticsHasFlag : public TestStatistics<TestType> {
+ public:
+  void SetUp() override {
+    TestStatistics<TestType>::SetUp();
+    this->SetUpSchema(Repetition::OPTIONAL);
+  }
+
+  std::shared_ptr<TypedStatistics<TestType>> BuildMergedStatistics(
+      const TypedStatistics<TestType>& stats1, const 
TypedStatistics<TestType>& stats2) {
+    auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
+    chunk_statistics->Merge(stats1);
+    chunk_statistics->Merge(stats2);
+    return chunk_statistics;
+  }
+
+  void VerifyMergedStatistics(const TypedStatistics<TestType>& stats1,
+                              const TypedStatistics<TestType>& stats2,
+                              const 
std::function<void(TypedStatistics<TestType>*)>& fn) {
+    fn(BuildMergedStatistics(stats1, stats2).get());
+    fn(BuildMergedStatistics(stats2, stats1).get());
+  }
+
+  // Distinct count should set to false when Merge is called.
+  void TestMergeDistinctCount() {
+    // Create a statistics object with distinct count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      EncodedStatistics encoded_statistics1;
+      statistics1 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics1,
+                           /*num_values=*/1000));
+      EXPECT_FALSE(statistics1->HasDistinctCount());
+    }
+
+    // Create a statistics object with distinct count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      EncodedStatistics encoded_statistics2;
+      encoded_statistics2.has_distinct_count = true;
+      encoded_statistics2.distinct_count = 500;
+      statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
+                           /*num_values=*/1000));
+      EXPECT_TRUE(statistics2->HasDistinctCount());
+    }
+
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             
EXPECT_FALSE(merged_statistics->HasDistinctCount());
+                             
EXPECT_FALSE(merged_statistics->Encode().has_distinct_count);
+                           });
+  }
+
+  // If all values in a page are null or nan, its stats should not set min-max.
+  // Merging its stats with another page having good min-max stats should not
+  // drop the valid min-max from the latter page.
+  void TestMergeMinMax() {
+    this->GenerateData(1000);
+    // Create a statistics object without min-max.
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics1->Update(this->values_ptr_, /*num_values=*/0,
+                          /*null_count*/ this->values_.size());
+      auto encoded_stats1 = statistics1->Encode();
+      EXPECT_FALSE(statistics1->HasMinMax());
+      EXPECT_FALSE(encoded_stats1.has_min);
+      EXPECT_FALSE(encoded_stats1.has_max);
+    }
+    // Create a statistics object with min-max.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      statistics2 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics2->Update(this->values_ptr_, this->values_.size(), 0);
+      auto encoded_stats2 = statistics2->Encode();
+      EXPECT_TRUE(statistics2->HasMinMax());
+      EXPECT_TRUE(encoded_stats2.has_min);
+      EXPECT_TRUE(encoded_stats2.has_max);
+    }
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             EXPECT_TRUE(merged_statistics->HasMinMax());
+                             EXPECT_TRUE(merged_statistics->Encode().has_min);
+                             EXPECT_TRUE(merged_statistics->Encode().has_max);
+                           });
+  }
+
+  // Default statistics should have null_count even if no nulls is written.
+  // However, if statistics read from thrift, it might not have null_count,
+  // and page merge with it should also not have null_count.
+  void TestMergeNullCount() {
+    this->GenerateData(1000);
+
+    // Page should have null-count even if no nulls
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics1->Update(this->values_ptr_, 
/*num_values=*/this->values_.size(),
+                          /*null_count=*/0);
+      auto encoded_stats1 = statistics1->Encode();
+      EXPECT_TRUE(statistics1->HasNullCount());
+      EXPECT_EQ(0, statistics1->null_count());
+      EXPECT_TRUE(statistics1->Encode().has_null_count);
+    }
+    // Merge with null-count should also have null count
+    VerifyMergedStatistics(*statistics1, *statistics1,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             EXPECT_TRUE(merged_statistics->HasNullCount());
+                             EXPECT_EQ(0, merged_statistics->null_count());
+                             auto encoded = merged_statistics->Encode();
+                             EXPECT_TRUE(encoded.has_null_count);
+                             EXPECT_EQ(0, encoded.null_count);
+                           });
+
+    // When loaded from thrift, might not have null count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      EncodedStatistics encoded_statistics2;
+      encoded_statistics2.has_null_count = false;
+      statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
+                           /*num_values=*/1000));
+      EXPECT_FALSE(statistics2->Encode().has_null_count);
+      EXPECT_FALSE(statistics2->HasNullCount());
+    }
+
+    // Merge without null-count should not have null count
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             EXPECT_FALSE(merged_statistics->HasNullCount());
+                             
EXPECT_FALSE(merged_statistics->Encode().has_null_count);
+                           });
+  }
+
+  // statistics.all_null_value is used to build the page index.
+  // If statistics doesn't have null count, all_null_value should be false.
+  void TestNotHasNullValue() {
+    EncodedStatistics encoded_statistics1;
+    encoded_statistics1.has_null_count = false;
+    auto statistics1 = Statistics::Make(this->schema_.Column(0), 
&encoded_statistics1,
+                                        /*num_values=*/1000);
+    auto s1 = 
std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics1);
+    EXPECT_FALSE(s1->HasNullCount());
+    auto encoded = s1->Encode();
+    EXPECT_FALSE(encoded.all_null_value);
+  }

Review Comment:
   ```suggestion
     void TestNotHasNullValue() {
       EncodedStatistics encoded_statistics;
       encoded_statistics.has_null_count = false;
       auto statistics = Statistics::Make(this->schema_.Column(0), 
&encoded_statistics,
                                          /*num_values=*/1000);
       auto typed_stats = 
std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics);
       EXPECT_FALSE(typed_stats->HasNullCount());
       auto encoded = typed_stats->Encode();
       EXPECT_FALSE(encoded.all_null_value);
     }
   ```



##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -590,6 +593,173 @@ TYPED_TEST(TestNumericStatistics, Equals) {
   ASSERT_NO_FATAL_FAILURE(this->TestEquals());
 }
 
+template <typename TestType>
+class TestStatisticsHasFlag : public TestStatistics<TestType> {
+ public:
+  void SetUp() override {
+    TestStatistics<TestType>::SetUp();
+    this->SetUpSchema(Repetition::OPTIONAL);
+  }
+
+  std::shared_ptr<TypedStatistics<TestType>> BuildMergedStatistics(
+      const TypedStatistics<TestType>& stats1, const 
TypedStatistics<TestType>& stats2) {
+    auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
+    chunk_statistics->Merge(stats1);
+    chunk_statistics->Merge(stats2);
+    return chunk_statistics;
+  }
+
+  void VerifyMergedStatistics(const TypedStatistics<TestType>& stats1,
+                              const TypedStatistics<TestType>& stats2,
+                              const 
std::function<void(TypedStatistics<TestType>*)>& fn) {
+    fn(BuildMergedStatistics(stats1, stats2).get());
+    fn(BuildMergedStatistics(stats2, stats1).get());
+  }
+
+  // Distinct count should set to false when Merge is called.
+  void TestMergeDistinctCount() {
+    // Create a statistics object with distinct count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      EncodedStatistics encoded_statistics1;
+      statistics1 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics1,
+                           /*num_values=*/1000));
+      EXPECT_FALSE(statistics1->HasDistinctCount());
+    }
+
+    // Create a statistics object with distinct count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      EncodedStatistics encoded_statistics2;
+      encoded_statistics2.has_distinct_count = true;
+      encoded_statistics2.distinct_count = 500;
+      statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
+                           /*num_values=*/1000));
+      EXPECT_TRUE(statistics2->HasDistinctCount());
+    }
+
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             
EXPECT_FALSE(merged_statistics->HasDistinctCount());
+                             
EXPECT_FALSE(merged_statistics->Encode().has_distinct_count);
+                           });
+  }
+
+  // If all values in a page are null or nan, its stats should not set min-max.
+  // Merging its stats with another page having good min-max stats should not
+  // drop the valid min-max from the latter page.
+  void TestMergeMinMax() {
+    this->GenerateData(1000);
+    // Create a statistics object without min-max.
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics1->Update(this->values_ptr_, /*num_values=*/0,
+                          /*null_count*/ this->values_.size());
+      auto encoded_stats1 = statistics1->Encode();
+      EXPECT_FALSE(statistics1->HasMinMax());
+      EXPECT_FALSE(encoded_stats1.has_min);
+      EXPECT_FALSE(encoded_stats1.has_max);
+    }
+    // Create a statistics object with min-max.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      statistics2 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics2->Update(this->values_ptr_, this->values_.size(), 0);
+      auto encoded_stats2 = statistics2->Encode();
+      EXPECT_TRUE(statistics2->HasMinMax());
+      EXPECT_TRUE(encoded_stats2.has_min);
+      EXPECT_TRUE(encoded_stats2.has_max);
+    }
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             EXPECT_TRUE(merged_statistics->HasMinMax());
+                             EXPECT_TRUE(merged_statistics->Encode().has_min);
+                             EXPECT_TRUE(merged_statistics->Encode().has_max);
+                           });
+  }
+
+  // Default statistics should have null_count even if no nulls is written.
+  // However, if statistics read from thrift, it might not have null_count,
+  // and page merge with it should also not have null_count.

Review Comment:
   ```suggestion
     // Default statistics should have null_count even if no nulls is written.
     // However, if statistics is created from thrift message, it might not
     // have null_count. Merging statistics from such page will result in an
     // invalid null_count as well.
   ```



##########
cpp/src/parquet/statistics.cc:
##########
@@ -552,12 +555,18 @@ class TypedStatisticsImpl : public TypedStatistics<DType> 
{
 
   void Merge(const TypedStatistics<DType>& other) override {
     this->num_values_ += other.num_values();
+    // Merge always runs when Merge builder's page statistics
+    // into column chunk statistics, so it usually has null count.

Review Comment:
   ```suggestion
       // null_count is always valid when merging page statistics into
       // column chunk statistics.
   ```



##########
cpp/src/parquet/statistics.cc:
##########
@@ -552,12 +556,17 @@ class TypedStatisticsImpl : public TypedStatistics<DType> 
{
 
   void Merge(const TypedStatistics<DType>& other) override {
     this->num_values_ += other.num_values();
+    // Merge always runs when Merge builder's page statistics
+    // into column chunk statistics, so it usually has null count.

Review Comment:
   OK, thanks for confirming!



##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -590,6 +593,173 @@ TYPED_TEST(TestNumericStatistics, Equals) {
   ASSERT_NO_FATAL_FAILURE(this->TestEquals());
 }
 
+template <typename TestType>
+class TestStatisticsHasFlag : public TestStatistics<TestType> {
+ public:
+  void SetUp() override {
+    TestStatistics<TestType>::SetUp();
+    this->SetUpSchema(Repetition::OPTIONAL);
+  }
+
+  std::shared_ptr<TypedStatistics<TestType>> BuildMergedStatistics(
+      const TypedStatistics<TestType>& stats1, const 
TypedStatistics<TestType>& stats2) {
+    auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
+    chunk_statistics->Merge(stats1);
+    chunk_statistics->Merge(stats2);
+    return chunk_statistics;
+  }
+
+  void VerifyMergedStatistics(const TypedStatistics<TestType>& stats1,
+                              const TypedStatistics<TestType>& stats2,
+                              const 
std::function<void(TypedStatistics<TestType>*)>& fn) {
+    fn(BuildMergedStatistics(stats1, stats2).get());
+    fn(BuildMergedStatistics(stats2, stats1).get());
+  }
+
+  // Distinct count should set to false when Merge is called.
+  void TestMergeDistinctCount() {
+    // Create a statistics object with distinct count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      EncodedStatistics encoded_statistics1;
+      statistics1 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics1,
+                           /*num_values=*/1000));
+      EXPECT_FALSE(statistics1->HasDistinctCount());
+    }
+
+    // Create a statistics object with distinct count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      EncodedStatistics encoded_statistics2;
+      encoded_statistics2.has_distinct_count = true;
+      encoded_statistics2.distinct_count = 500;
+      statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
+                           /*num_values=*/1000));
+      EXPECT_TRUE(statistics2->HasDistinctCount());
+    }
+
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             
EXPECT_FALSE(merged_statistics->HasDistinctCount());
+                             
EXPECT_FALSE(merged_statistics->Encode().has_distinct_count);
+                           });
+  }
+
+  // If all values in a page are null or nan, its stats should not set min-max.
+  // Merging its stats with another page having good min-max stats should not
+  // drop the valid min-max from the latter page.
+  void TestMergeMinMax() {
+    this->GenerateData(1000);
+    // Create a statistics object without min-max.
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics1->Update(this->values_ptr_, /*num_values=*/0,
+                          /*null_count*/ this->values_.size());

Review Comment:
   ```suggestion
                             /*null_count=*/ this->values_.size());
   ```



##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -590,6 +593,173 @@ TYPED_TEST(TestNumericStatistics, Equals) {
   ASSERT_NO_FATAL_FAILURE(this->TestEquals());
 }
 
+template <typename TestType>
+class TestStatisticsHasFlag : public TestStatistics<TestType> {
+ public:
+  void SetUp() override {
+    TestStatistics<TestType>::SetUp();
+    this->SetUpSchema(Repetition::OPTIONAL);
+  }
+
+  std::shared_ptr<TypedStatistics<TestType>> BuildMergedStatistics(
+      const TypedStatistics<TestType>& stats1, const 
TypedStatistics<TestType>& stats2) {
+    auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
+    chunk_statistics->Merge(stats1);
+    chunk_statistics->Merge(stats2);
+    return chunk_statistics;
+  }
+
+  void VerifyMergedStatistics(const TypedStatistics<TestType>& stats1,
+                              const TypedStatistics<TestType>& stats2,
+                              const 
std::function<void(TypedStatistics<TestType>*)>& fn) {
+    fn(BuildMergedStatistics(stats1, stats2).get());
+    fn(BuildMergedStatistics(stats2, stats1).get());

Review Comment:
   Thanks for the info!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to