mapleFU commented on code in PR #35989:
URL: https://github.com/apache/arrow/pull/35989#discussion_r1245504781
##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -377,6 +380,79 @@ class TestStatistics : public PrimitiveTypedTest<TestType>
{
ASSERT_EQ(total->max(), std::max(statistics1->max(), statistics2->max()));
}
+ void TestMergeEmpty() {
+ EncodedStatistics encoded_statistics1;
+ auto statistics1 = Statistics::Make(this->schema_.Column(0),
&encoded_statistics1,
+ /*num_values=*/1000);
+ auto s1 =
std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics1);
+
+ EXPECT_FALSE(statistics1->HasMinMax());
+ EXPECT_FALSE(statistics1->HasDistinctCount());
+ EXPECT_FALSE(statistics1->HasNullCount());
+
+ EncodedStatistics encoded_statistics2;
+ encoded_statistics2.has_distinct_count = true;
+ encoded_statistics2.distinct_count = 500;
+ auto statistics2 = Statistics::Make(this->schema_.Column(0),
&encoded_statistics2,
+ /*num_values=*/1000);
+
+ EXPECT_FALSE(statistics2->HasMinMax());
+ EXPECT_TRUE(statistics2->HasDistinctCount());
+ EXPECT_FALSE(s1->HasNullCount());
+ auto s2 =
std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics2);
+
+ auto total = MakeStatistics<TestType>(this->schema_.Column(0));
+ total->Merge(*s1);
+ total->Merge(*s2);
+
+ EXPECT_FALSE(total->HasDistinctCount());
+ EXPECT_FALSE(total->HasMinMax());
+ EXPECT_EQ(2000, total->num_values());
+ EXPECT_FALSE(total->HasNullCount());
+ }
+
+ void TestNotHasNullValue() {
+ EncodedStatistics encoded_statistics1;
+ encoded_statistics1.has_null_count = false;
+ auto statistics1 = Statistics::Make(this->schema_.Column(0),
&encoded_statistics1,
+ /*num_values=*/1000);
+ auto s1 =
std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics1);
+ EXPECT_FALSE(s1->HasNullCount());
+ auto encoded = s1->Encode();
+ EXPECT_FALSE(encoded.all_null_value);
+ }
+
+ void TestMergeMinMax() {
+ this->GenerateData(1000);
+
+ auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
+
+ {
+ auto page_statistics1 =
MakeStatistics<TestType>(this->schema_.Column(0));
+ std::vector<uint8_t> valid_bits(
+ bit_util::BytesForBits(static_cast<uint32_t>(this->values_.size()))
+ 1, 0);
+ page_statistics1->Update(this->values_ptr_, /*num_values=*/0,
+ /*null_count*/ this->values_.size());
+ auto encoded_stats1 = page_statistics1->Encode();
+ EXPECT_FALSE(encoded_stats1.has_min);
+ EXPECT_FALSE(encoded_stats1.has_max);
+
+ chunk_statistics->Merge(*page_statistics1);
+ encoded_stats1 = chunk_statistics->Encode();
+ EXPECT_FALSE(encoded_stats1.has_min);
+ EXPECT_FALSE(encoded_stats1.has_max);
+ }
+ {
+ auto page_statistics2 =
MakeStatistics<TestType>(this->schema_.Column(0));
+ page_statistics2->Update(this->values_ptr_, this->values_.size(), 0);
+
+ chunk_statistics->Merge(*page_statistics2);
+ auto encoded_stats2 = chunk_statistics->Encode();
+ EXPECT_TRUE(encoded_stats2.has_min);
+ EXPECT_TRUE(encoded_stats2.has_max);
Review Comment:
Or we can distinguish between `all is null or nan` and `not has min-max, cannot be
merged`. For now, I prefer to stay compatible with the previous behavior.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]