mapleFU commented on code in PR #35989:
URL: https://github.com/apache/arrow/pull/35989#discussion_r1245495218
##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -377,6 +380,79 @@ class TestStatistics : public PrimitiveTypedTest<TestType>
{
ASSERT_EQ(total->max(), std::max(statistics1->max(), statistics2->max()));
}
+ void TestMergeEmpty() {
+ EncodedStatistics encoded_statistics1;
+ auto statistics1 = Statistics::Make(this->schema_.Column(0),
&encoded_statistics1,
+ /*num_values=*/1000);
+ auto s1 =
std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics1);
+
+ EXPECT_FALSE(statistics1->HasMinMax());
+ EXPECT_FALSE(statistics1->HasDistinctCount());
+ EXPECT_FALSE(statistics1->HasNullCount());
+
+ EncodedStatistics encoded_statistics2;
+ encoded_statistics2.has_distinct_count = true;
+ encoded_statistics2.distinct_count = 500;
+ auto statistics2 = Statistics::Make(this->schema_.Column(0),
&encoded_statistics2,
+ /*num_values=*/1000);
+
+ EXPECT_FALSE(statistics2->HasMinMax());
+ EXPECT_TRUE(statistics2->HasDistinctCount());
+ EXPECT_FALSE(s1->HasNullCount());
+ auto s2 =
std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics2);
+
+ auto total = MakeStatistics<TestType>(this->schema_.Column(0));
+ total->Merge(*s1);
+ total->Merge(*s2);
+
+ EXPECT_FALSE(total->HasDistinctCount());
+ EXPECT_FALSE(total->HasMinMax());
+ EXPECT_EQ(2000, total->num_values());
+ EXPECT_FALSE(total->HasNullCount());
+ }
+
+ void TestNotHasNullValue() {
+ EncodedStatistics encoded_statistics1;
+ encoded_statistics1.has_null_count = false;
+ auto statistics1 = Statistics::Make(this->schema_.Column(0),
&encoded_statistics1,
+ /*num_values=*/1000);
+ auto s1 =
std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics1);
+ EXPECT_FALSE(s1->HasNullCount());
+ auto encoded = s1->Encode();
+ EXPECT_FALSE(encoded.all_null_value);
+ }
+
+ void TestMergeMinMax() {
+ this->GenerateData(1000);
+
+ auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
+
+ {
+ auto page_statistics1 =
MakeStatistics<TestType>(this->schema_.Column(0));
+ std::vector<uint8_t> valid_bits(
+ bit_util::BytesForBits(static_cast<uint32_t>(this->values_.size()))
+ 1, 0);
+ page_statistics1->Update(this->values_ptr_, /*num_values=*/0,
+ /*null_count*/ this->values_.size());
+ auto encoded_stats1 = page_statistics1->Encode();
+ EXPECT_FALSE(encoded_stats1.has_min);
+ EXPECT_FALSE(encoded_stats1.has_max);
+
+ chunk_statistics->Merge(*page_statistics1);
+ encoded_stats1 = chunk_statistics->Encode();
+ EXPECT_FALSE(encoded_stats1.has_min);
+ EXPECT_FALSE(encoded_stats1.has_max);
+ }
+ {
+ auto page_statistics2 =
MakeStatistics<TestType>(this->schema_.Column(0));
+ page_statistics2->Update(this->values_ptr_, this->values_.size(), 0);
+
+ chunk_statistics->Merge(*page_statistics2);
+ auto encoded_stats2 = chunk_statistics->Encode();
+ EXPECT_TRUE(encoded_stats2.has_min);
+ EXPECT_TRUE(encoded_stats2.has_max);
Review Comment:
Assume there are two pages. The first is `[null, null, ..., null]`, so it doesn't
have min-max.
The second is `[1, 1, 1, ..., 1]`, so in the end the merged chunk statistics
would have min-max.
This is the same as the behavior on the `main` branch.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]