This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new c99d318b33 GH-37014: [C++][Parquet] Preserve some Parquet distinct
counts when merging stats (#37016)
c99d318b33 is described below
commit c99d318b3306f704b1fa39bdffe15d62e4c5d43b
Author: Elliott Brossard <[email protected]>
AuthorDate: Thu Aug 31 06:01:03 2023 -0700
GH-37014: [C++][Parquet] Preserve some Parquet distinct counts when merging
stats (#37016)
### Rationale for this change
Statistics merging is unnecessarily conservative with respect to preserving
distinct counts. If either side is 0, the merge stats can preserve the distinct
count, instead of clearing it.
### What changes are included in this PR?
Distinct count is preserved if either the lhs or rhs has a distinct count
of 0 when merging stats.
### Are these changes tested?
Yes, added a unit test.
### Are there any user-facing changes?
I don't think this change to stats is considered to be user-facing.
* Closes: #37014
Lead-authored-by: Elliott Brossard <[email protected]>
Co-authored-by: Elliott Brossard
<[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/parquet/statistics.cc | 15 ++++++--
cpp/src/parquet/statistics_test.cc | 76 ++++++++++++++++++++++++--------------
2 files changed, 60 insertions(+), 31 deletions(-)
diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc
index a10fc2a4cf..d5fc71b395 100644
--- a/cpp/src/parquet/statistics.cc
+++ b/cpp/src/parquet/statistics.cc
@@ -562,8 +562,15 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
} else {
this->has_null_count_ = false;
}
- // Clear has_distinct_count_ as distinct count cannot be merged.
- has_distinct_count_ = false;
+ if (has_distinct_count_ && other.HasDistinctCount() &&
+ (distinct_count() == 0 || other.distinct_count() == 0)) {
+ // We can merge distinct counts if either side is zero.
+ statistics_.distinct_count =
+ std::max(statistics_.distinct_count, other.distinct_count());
+ } else {
+ // Otherwise clear has_distinct_count_ as distinct count cannot be
merged.
+ this->has_distinct_count_ = false;
+ }
// Do not clear min/max here if the other side does not provide
// min/max which may happen when other is an empty stats or all
// its values are null and/or NaN.
@@ -621,7 +628,9 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
// num_values_ is reliable and it means number of non-null values.
s.all_null_value = num_values_ == 0;
}
- // TODO (GH-36505): distinct count is not encoded for now.
+ if (HasDistinctCount()) {
+ s.set_distinct_count(this->distinct_count());
+ }
return s;
}
diff --git a/cpp/src/parquet/statistics_test.cc
b/cpp/src/parquet/statistics_test.cc
index 76667f0029..637832945e 100644
--- a/cpp/src/parquet/statistics_test.cc
+++ b/cpp/src/parquet/statistics_test.cc
@@ -601,6 +601,39 @@ class TestStatisticsHasFlag : public
TestStatistics<TestType> {
this->SetUpSchema(Repetition::OPTIONAL);
}
+ std::optional<int64_t> MergeDistinctCount(
+ std::optional<int64_t> initial,
+ const std::vector<std::optional<int64_t>>& subsequent) {
+ EncodedStatistics encoded_statistics;
+ if (initial) {
+ encoded_statistics.has_distinct_count = true;
+ encoded_statistics.distinct_count = *initial;
+ }
+ std::shared_ptr<TypedStatistics<TestType>> statistics =
+ std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+ Statistics::Make(this->schema_.Column(0), &encoded_statistics,
+ /*num_values=*/1000));
+ for (const auto& distinct_count : subsequent) {
+ EncodedStatistics next_encoded_statistics;
+ if (distinct_count) {
+ next_encoded_statistics.has_distinct_count = true;
+ next_encoded_statistics.distinct_count = *distinct_count;
+ }
+ std::shared_ptr<TypedStatistics<TestType>> next_statistics =
+ std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+ Statistics::Make(this->schema_.Column(0),
&next_encoded_statistics,
+ /*num_values=*/1000));
+ statistics->Merge(*next_statistics);
+ }
+ EncodedStatistics final_statistics = statistics->Encode();
+ EXPECT_EQ(statistics->HasDistinctCount(),
final_statistics.has_distinct_count);
+ if (statistics->HasDistinctCount()) {
+ EXPECT_EQ(statistics->distinct_count(), final_statistics.distinct_count);
+ return statistics->distinct_count();
+ }
+ return std::nullopt;
+ }
+
std::shared_ptr<TypedStatistics<TestType>> MergedStatistics(
const TypedStatistics<TestType>& stats1, const
TypedStatistics<TestType>& stats2) {
auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
@@ -616,35 +649,22 @@ class TestStatisticsHasFlag : public
TestStatistics<TestType> {
ASSERT_NO_FATAL_FAILURE(test_fn(MergedStatistics(stats2, stats1).get()));
}
- // Distinct count should set to false when Merge is called.
+ // Distinct count should set to false when Merge is called, unless one of
the statistics
+ // has a zero count.
void TestMergeDistinctCount() {
- // Create a statistics object with distinct count.
- std::shared_ptr<TypedStatistics<TestType>> statistics1;
- {
- EncodedStatistics encoded_statistics1;
- statistics1 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
- Statistics::Make(this->schema_.Column(0), &encoded_statistics1,
- /*num_values=*/1000));
- EXPECT_FALSE(statistics1->HasDistinctCount());
- }
-
- // Create a statistics object with distinct count.
- std::shared_ptr<TypedStatistics<TestType>> statistics2;
- {
- EncodedStatistics encoded_statistics2;
- encoded_statistics2.has_distinct_count = true;
- encoded_statistics2.distinct_count = 500;
- statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
- Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
- /*num_values=*/1000));
- EXPECT_TRUE(statistics2->HasDistinctCount());
- }
-
- VerifyMergedStatistics(*statistics1, *statistics2,
- [](TypedStatistics<TestType>* merged_statistics) {
-
EXPECT_FALSE(merged_statistics->HasDistinctCount());
-
EXPECT_FALSE(merged_statistics->Encode().has_distinct_count);
- });
+ // Sanity tests.
+ ASSERT_EQ(std::nullopt, MergeDistinctCount(std::nullopt, {}));
+ ASSERT_EQ(10, MergeDistinctCount(10, {}));
+
+ ASSERT_EQ(std::nullopt, MergeDistinctCount(std::nullopt, {0}));
+ ASSERT_EQ(std::nullopt, MergeDistinctCount(std::nullopt, {10, 0}));
+ ASSERT_EQ(10, MergeDistinctCount(10, {0, 0}));
+ ASSERT_EQ(10, MergeDistinctCount(0, {10, 0}));
+ ASSERT_EQ(10, MergeDistinctCount(0, {0, 10}));
+ ASSERT_EQ(10, MergeDistinctCount(0, {0, 10, 0}));
+ ASSERT_EQ(std::nullopt, MergeDistinctCount(10, {0, 10}));
+ ASSERT_EQ(std::nullopt, MergeDistinctCount(10, {0, std::nullopt}));
+ ASSERT_EQ(std::nullopt, MergeDistinctCount(0, {std::nullopt, 0}));
}
// If all values in a page are null or nan, its stats should not set min-max.