pitrou commented on code in PR #35989:
URL: https://github.com/apache/arrow/pull/35989#discussion_r1245458806
##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -377,6 +380,79 @@ class TestStatistics : public PrimitiveTypedTest<TestType>
{
ASSERT_EQ(total->max(), std::max(statistics1->max(), statistics2->max()));
}
+ void TestMergeEmpty() {
+ EncodedStatistics encoded_statistics1;
+ auto statistics1 = Statistics::Make(this->schema_.Column(0),
&encoded_statistics1,
+ /*num_values=*/1000);
+ auto s1 =
std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics1);
+
+ EXPECT_FALSE(statistics1->HasMinMax());
+ EXPECT_FALSE(statistics1->HasDistinctCount());
+ EXPECT_FALSE(statistics1->HasNullCount());
+
+ EncodedStatistics encoded_statistics2;
+ encoded_statistics2.has_distinct_count = true;
+ encoded_statistics2.distinct_count = 500;
+ auto statistics2 = Statistics::Make(this->schema_.Column(0),
&encoded_statistics2,
+ /*num_values=*/1000);
+
+ EXPECT_FALSE(statistics2->HasMinMax());
+ EXPECT_TRUE(statistics2->HasDistinctCount());
+ EXPECT_FALSE(s1->HasNullCount());
+ auto s2 =
std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics2);
+
+ auto total = MakeStatistics<TestType>(this->schema_.Column(0));
+ total->Merge(*s1);
+ total->Merge(*s2);
+
+ EXPECT_FALSE(total->HasDistinctCount());
+ EXPECT_FALSE(total->HasMinMax());
+ EXPECT_EQ(2000, total->num_values());
+ EXPECT_FALSE(total->HasNullCount());
+ }
+
+ void TestNotHasNullValue() {
+ EncodedStatistics encoded_statistics1;
+ encoded_statistics1.has_null_count = false;
+ auto statistics1 = Statistics::Make(this->schema_.Column(0),
&encoded_statistics1,
+ /*num_values=*/1000);
+ auto s1 =
std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics1);
+ EXPECT_FALSE(s1->HasNullCount());
+ auto encoded = s1->Encode();
+ EXPECT_FALSE(encoded.all_null_value);
+ }
+
+ void TestMergeMinMax() {
+ this->GenerateData(1000);
+
+ auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
+
+ {
+ auto page_statistics1 =
MakeStatistics<TestType>(this->schema_.Column(0));
+ std::vector<uint8_t> valid_bits(
+ bit_util::BytesForBits(static_cast<uint32_t>(this->values_.size()))
+ 1, 0);
Review Comment:
`valid_bits` is not even used, is it?
##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -377,6 +380,79 @@ class TestStatistics : public PrimitiveTypedTest<TestType>
{
ASSERT_EQ(total->max(), std::max(statistics1->max(), statistics2->max()));
}
+ void TestMergeEmpty() {
+ EncodedStatistics encoded_statistics1;
+ auto statistics1 = Statistics::Make(this->schema_.Column(0),
&encoded_statistics1,
+ /*num_values=*/1000);
+ auto s1 =
std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics1);
+
+ EXPECT_FALSE(statistics1->HasMinMax());
+ EXPECT_FALSE(statistics1->HasDistinctCount());
+ EXPECT_FALSE(statistics1->HasNullCount());
+
+ EncodedStatistics encoded_statistics2;
+ encoded_statistics2.has_distinct_count = true;
+ encoded_statistics2.distinct_count = 500;
+ auto statistics2 = Statistics::Make(this->schema_.Column(0),
&encoded_statistics2,
+ /*num_values=*/1000);
+
+ EXPECT_FALSE(statistics2->HasMinMax());
+ EXPECT_TRUE(statistics2->HasDistinctCount());
+ EXPECT_FALSE(s1->HasNullCount());
+ auto s2 =
std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics2);
+
+ auto total = MakeStatistics<TestType>(this->schema_.Column(0));
+ total->Merge(*s1);
+ total->Merge(*s2);
+
+ EXPECT_FALSE(total->HasDistinctCount());
+ EXPECT_FALSE(total->HasMinMax());
+ EXPECT_EQ(2000, total->num_values());
+ EXPECT_FALSE(total->HasNullCount());
+ }
+
+ void TestNotHasNullValue() {
+ EncodedStatistics encoded_statistics1;
+ encoded_statistics1.has_null_count = false;
+ auto statistics1 = Statistics::Make(this->schema_.Column(0),
&encoded_statistics1,
+ /*num_values=*/1000);
+ auto s1 =
std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics1);
+ EXPECT_FALSE(s1->HasNullCount());
+ auto encoded = s1->Encode();
+ EXPECT_FALSE(encoded.all_null_value);
+ }
+
+ void TestMergeMinMax() {
+ this->GenerateData(1000);
+
+ auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
+
+ {
+ auto page_statistics1 =
MakeStatistics<TestType>(this->schema_.Column(0));
+ std::vector<uint8_t> valid_bits(
+ bit_util::BytesForBits(static_cast<uint32_t>(this->values_.size()))
+ 1, 0);
+ page_statistics1->Update(this->values_ptr_, /*num_values=*/0,
+ /*null_count*/ this->values_.size());
+ auto encoded_stats1 = page_statistics1->Encode();
+ EXPECT_FALSE(encoded_stats1.has_min);
+ EXPECT_FALSE(encoded_stats1.has_max);
+
+ chunk_statistics->Merge(*page_statistics1);
+ encoded_stats1 = chunk_statistics->Encode();
+ EXPECT_FALSE(encoded_stats1.has_min);
+ EXPECT_FALSE(encoded_stats1.has_max);
+ }
+ {
+ auto page_statistics2 =
MakeStatistics<TestType>(this->schema_.Column(0));
+ page_statistics2->Update(this->values_ptr_, this->values_.size(), 0);
+
+ chunk_statistics->Merge(*page_statistics2);
+ auto encoded_stats2 = chunk_statistics->Encode();
+ EXPECT_TRUE(encoded_stats2.has_min);
+ EXPECT_TRUE(encoded_stats2.has_max);
Review Comment:
Hmm... why are `has_min` and `has_max` true here? Also, why not check the
actual min/max values as well?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]