sfc-gh-ebrossard commented on code in PR #37016:
URL: https://github.com/apache/arrow/pull/37016#discussion_r1307787774
##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -645,6 +652,50 @@ class TestStatisticsHasFlag : public
TestStatistics<TestType> {
EXPECT_FALSE(merged_statistics->HasDistinctCount());
EXPECT_FALSE(merged_statistics->Encode().has_distinct_count);
});
+
+ // Create a statistics object with zero distinct count. Merging preserves
the distinct
+ // count if either side is zero.
+ std::shared_ptr<TypedStatistics<TestType>> statistics3;
+ std::shared_ptr<TypedStatistics<TestType>> statistics4;
+ {
+ EncodedStatistics encoded_statistics3;
+ encoded_statistics3.has_distinct_count = true;
+ encoded_statistics3.distinct_count = 0;
+ statistics3 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+ Statistics::Make(this->schema_.Column(0), &encoded_statistics3,
+ /*num_values=*/0));
+ EXPECT_TRUE(statistics3->HasDistinctCount());
+
+ EncodedStatistics encoded_statistics4;
+ encoded_statistics4.has_distinct_count = true;
+ encoded_statistics4.distinct_count = 10;
+ statistics4 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+ Statistics::Make(this->schema_.Column(0), &encoded_statistics4,
+ /*num_values=*/10));
+ EXPECT_TRUE(statistics4->HasDistinctCount());
+ }
Review Comment:
Added a `MergeDistinctCount` test helper; PTAL.
##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -602,18 +602,25 @@ class TestStatisticsHasFlag : public
TestStatistics<TestType> {
}
std::shared_ptr<TypedStatistics<TestType>> MergedStatistics(
- const TypedStatistics<TestType>& stats1, const
TypedStatistics<TestType>& stats2) {
- auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
+ const TypedStatistics<TestType>& stats1, const
TypedStatistics<TestType>& stats2,
+ const EncodedStatistics* initial_statistics) {
+ auto chunk_statistics =
std::static_pointer_cast<TypedStatistics<TestType>>(
+ initial_statistics != nullptr
+ ? Statistics::Make(this->schema_.Column(0), initial_statistics)
+ : Statistics::Make(this->schema_.Column(0)));
chunk_statistics->Merge(stats1);
chunk_statistics->Merge(stats2);
return chunk_statistics;
}
void VerifyMergedStatistics(
const TypedStatistics<TestType>& stats1, const
TypedStatistics<TestType>& stats2,
- const std::function<void(TypedStatistics<TestType>*)>& test_fn) {
- ASSERT_NO_FATAL_FAILURE(test_fn(MergedStatistics(stats1, stats2).get()));
- ASSERT_NO_FATAL_FAILURE(test_fn(MergedStatistics(stats2, stats1).get()));
+ const std::function<void(TypedStatistics<TestType>*)>& test_fn,
+ const EncodedStatistics* initial_statistics = nullptr) {
Review Comment:
Now we just use `MergeDistinctCount` for the distinct count tests, so I
removed the overload.
##########
cpp/src/parquet/statistics.cc:
##########
@@ -621,7 +628,9 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
// num_values_ is reliable and it means number of non-null values.
s.all_null_value = num_values_ == 0;
}
- // TODO (GH-36505): distinct count is not encoded for now.
+ if (HasDistinctCount()) {
Review Comment:
Sorry, by, "That seems ok to me," do you mean to leave this here? Or to
remove it?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]