This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 60fdc25583 GH-34351: [C++][Parquet] Statistics: add detail
documentation and tiny optimization (#35989)
60fdc25583 is described below
commit 60fdc25583cb391871cd7dd67372e8d9df9f0f45
Author: mwish <[email protected]>
AuthorDate: Fri Jul 7 01:10:30 2023 +0800
GH-34351: [C++][Parquet] Statistics: add detail documentation and tiny
optimization (#35989)
### Rationale for this change
### What changes are included in this PR?
1. This patch does some tiny optimizations on Parquet C++ Statistics. It
does:
```
For min-max, using std::string. Because assume the case like that:
EncodedStatistics c1;
// do some operations
EncodedStatistics c2 = c1;
c2.set_max("dasdasdassd");
After c2 set, c1 would be set too. So I use std::string here.
```
2. Force clear ndv count during merging, and set `has_distinct_count_ =
false`, and add some comments
3. Add some specification in Statistics API
### Are these changes tested?
Yes
### Are there any user-facing changes?
No
* Closes: #34351
Lead-authored-by: mwish <[email protected]>
Co-authored-by: mwish <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Co-authored-by: Gang Wu <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
c_glib/test/parquet/test-statistics.rb | 2 +-
cpp/src/parquet/statistics.cc | 59 +++++++---
cpp/src/parquet/statistics.h | 27 ++---
cpp/src/parquet/statistics_test.cc | 191 +++++++++++++++++++++++++++++++++
4 files changed, 249 insertions(+), 30 deletions(-)
diff --git a/c_glib/test/parquet/test-statistics.rb
b/c_glib/test/parquet/test-statistics.rb
index 4f21ebf00c..0367084c88 100644
--- a/c_glib/test/parquet/test-statistics.rb
+++ b/c_glib/test/parquet/test-statistics.rb
@@ -51,7 +51,7 @@ class TestParquetStatistics < Test::Unit::TestCase
test("#has_n_distinct_values?") do
assert do
- @statistics.has_n_distinct_values?
+ not @statistics.has_n_distinct_values?
end
end
diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc
index aa6df7e32a..a10fc2a4cf 100644
--- a/cpp/src/parquet/statistics.cc
+++ b/cpp/src/parquet/statistics.cc
@@ -463,6 +463,7 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
public:
using T = typename DType::c_type;
+ // Create an empty stats.
TypedStatisticsImpl(const ColumnDescriptor* descr, MemoryPool* pool)
: descr_(descr),
pool_(pool),
@@ -471,10 +472,9 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
auto comp = Comparator::Make(descr);
comparator_ = std::static_pointer_cast<TypedComparator<DType>>(comp);
TypedStatisticsImpl::Reset();
- has_null_count_ = true;
- has_distinct_count_ = true;
}
+ // Create stats from provided values.
TypedStatisticsImpl(const T& min, const T& max, int64_t num_values, int64_t
null_count,
int64_t distinct_count)
: pool_(default_memory_pool()),
@@ -482,24 +482,29 @@ class TypedStatisticsImpl : public TypedStatistics<DType>
{
max_buffer_(AllocateBuffer(pool_, 0)) {
TypedStatisticsImpl::IncrementNumValues(num_values);
TypedStatisticsImpl::IncrementNullCount(null_count);
- IncrementDistinctCount(distinct_count);
+ SetDistinctCount(distinct_count);
Copy(min, &min_, min_buffer_.get());
Copy(max, &max_, max_buffer_.get());
has_min_max_ = true;
}
+ // Create stats from a thrift Statistics object.
TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string&
encoded_min,
const std::string& encoded_max, int64_t num_values,
int64_t null_count, int64_t distinct_count, bool
has_min_max,
bool has_null_count, bool has_distinct_count,
MemoryPool* pool)
: TypedStatisticsImpl(descr, pool) {
TypedStatisticsImpl::IncrementNumValues(num_values);
- if (has_null_count_) {
+ if (has_null_count) {
TypedStatisticsImpl::IncrementNullCount(null_count);
+ } else {
+ has_null_count_ = false;
}
if (has_distinct_count) {
- IncrementDistinctCount(distinct_count);
+ SetDistinctCount(distinct_count);
+ } else {
+ has_distinct_count_ = false;
}
if (!encoded_min.empty()) {
@@ -541,9 +546,7 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
void Reset() override {
ResetCounts();
- has_min_max_ = false;
- has_distinct_count_ = false;
- has_null_count_ = false;
+ ResetHasFlags();
}
void SetMinMax(const T& arg_min, const T& arg_max) override {
@@ -552,12 +555,18 @@ class TypedStatisticsImpl : public TypedStatistics<DType>
{
void Merge(const TypedStatistics<DType>& other) override {
this->num_values_ += other.num_values();
+ // null_count is always valid when merging page statistics into
+ // column chunk statistics.
if (other.HasNullCount()) {
this->statistics_.null_count += other.null_count();
+ } else {
+ this->has_null_count_ = false;
}
- if (other.HasDistinctCount()) {
- this->statistics_.distinct_count += other.distinct_count();
- }
+ // Clear has_distinct_count_ as distinct count cannot be merged.
+ has_distinct_count_ = false;
+ // Do not clear min/max here if the other side does not provide
+ // min/max which may happen when other is an empty stats or all
+ // its values are null and/or NaN.
if (other.HasMinMax()) {
SetMinMax(other.min(), other.max());
}
@@ -609,9 +618,10 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
}
if (HasNullCount()) {
s.set_null_count(this->null_count());
+ // num_values_ is reliable and it means number of non-null values.
+ s.all_null_value = num_values_ == 0;
}
- // num_values_ is reliable and it means number of non-null values.
- s.all_null_value = num_values_ == 0;
+ // TODO (GH-36505): distinct count is not encoded for now.
return s;
}
@@ -627,7 +637,12 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
T min_;
T max_;
::arrow::MemoryPool* pool_;
- int64_t num_values_ = 0; // # of non-null values.
+ // Number of non-null values.
+ // Please note that num_values_ is reliable when has_null_count_ is set.
+ // When has_null_count_ is not set, e.g. a page statistics created from
+ // a statistics thrift message which doesn't have the optional null_count,
+ // `num_values_` may include null values.
+ int64_t num_values_ = 0;
EncodedStatistics statistics_;
std::shared_ptr<TypedComparator<DType>> comparator_;
std::shared_ptr<ResizableBuffer> min_buffer_, max_buffer_;
@@ -637,8 +652,9 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
void Copy(const T& src, T* dst, ResizableBuffer*) { *dst = src; }
- void IncrementDistinctCount(int64_t n) {
- statistics_.distinct_count += n;
+ void SetDistinctCount(int64_t n) {
+ // distinct count can only be "set", and cannot be incremented.
+ statistics_.distinct_count = n;
has_distinct_count_ = true;
}
@@ -648,6 +664,17 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
this->num_values_ = 0;
}
+ void ResetHasFlags() {
+ // has_min_max_ will only be set when it meets any valid value.
+ this->has_min_max_ = false;
+ // has_distinct_count_ will only be set once SetDistinctCount()
+ // is called because distinct count calculation is not cheap and
+ // disabled by default.
+ this->has_distinct_count_ = false;
+ // Null count calculation is cheap and enabled by default.
+ this->has_null_count_ = true;
+ }
+
void SetMinMaxPair(std::pair<T, T> min_max) {
// CleanStatistic can return a nullopt in case of erroneous values, e.g.
NaN
auto maybe_min_max = CleanStatistic(min_max);
diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h
index 3f168a938f..ae6c1ca29b 100644
--- a/cpp/src/parquet/statistics.h
+++ b/cpp/src/parquet/statistics.h
@@ -117,17 +117,16 @@ std::shared_ptr<TypedComparator<DType>>
MakeComparator(const ColumnDescriptor* d
// ----------------------------------------------------------------------
/// \brief Structure represented encoded statistics to be written to
-/// and from Parquet serialized metadata
+/// and read from Parquet serialized metadata.
class PARQUET_EXPORT EncodedStatistics {
- std::shared_ptr<std::string> max_, min_;
+ std::string max_, min_;
bool is_signed_ = false;
public:
- EncodedStatistics()
- : max_(std::make_shared<std::string>()),
min_(std::make_shared<std::string>()) {}
+ EncodedStatistics() = default;
- const std::string& max() const { return *max_; }
- const std::string& min() const { return *min_; }
+ const std::string& max() const { return max_; }
+ const std::string& min() const { return min_; }
int64_t null_count = 0;
int64_t distinct_count = 0;
@@ -149,11 +148,13 @@ class PARQUET_EXPORT EncodedStatistics {
// the true minimum for aggregations and there is no way to mark that a
// value has been truncated and is a lower bound and not in the page.
void ApplyStatSizeLimits(size_t length) {
- if (max_->length() > length) {
+ if (max_.length() > length) {
has_max = false;
+ max_.clear();
}
- if (min_->length() > length) {
+ if (min_.length() > length) {
has_min = false;
+ min_.clear();
}
}
@@ -165,14 +166,14 @@ class PARQUET_EXPORT EncodedStatistics {
void set_is_signed(bool is_signed) { is_signed_ = is_signed; }
- EncodedStatistics& set_max(const std::string& value) {
- *max_ = value;
+ EncodedStatistics& set_max(std::string value) {
+ max_ = std::move(value);
has_max = true;
return *this;
}
- EncodedStatistics& set_min(const std::string& value) {
- *min_ = value;
+ EncodedStatistics& set_min(std::string value) {
+ min_ = std::move(value);
has_min = true;
return *this;
}
@@ -329,7 +330,7 @@ class TypedStatistics : public Statistics {
/// null count is determined from the indices)
virtual void IncrementNullCount(int64_t n) = 0;
- /// \brief Increments the number ov values directly
+ /// \brief Increments the number of values directly
/// The same note on IncrementNullCount applies here
virtual void IncrementNumValues(int64_t n) = 0;
};
diff --git a/cpp/src/parquet/statistics_test.cc
b/cpp/src/parquet/statistics_test.cc
index 8b9a42aa18..76667f0029 100644
--- a/cpp/src/parquet/statistics_test.cc
+++ b/cpp/src/parquet/statistics_test.cc
@@ -346,6 +346,9 @@ class TestStatistics : public PrimitiveTypedTest<TestType> {
ASSERT_EQ(this->values_.size(), statistics->num_values());
statistics->Reset();
+ ASSERT_TRUE(statistics->HasNullCount());
+ ASSERT_FALSE(statistics->HasMinMax());
+ ASSERT_FALSE(statistics->HasDistinctCount());
ASSERT_EQ(0, statistics->null_count());
ASSERT_EQ(0, statistics->num_values());
ASSERT_EQ(0, statistics->distinct_count());
@@ -590,6 +593,178 @@ TYPED_TEST(TestNumericStatistics, Equals) {
ASSERT_NO_FATAL_FAILURE(this->TestEquals());
}
+template <typename TestType>
+class TestStatisticsHasFlag : public TestStatistics<TestType> {
+ public:
+ void SetUp() override {
+ TestStatistics<TestType>::SetUp();
+ this->SetUpSchema(Repetition::OPTIONAL);
+ }
+
+ std::shared_ptr<TypedStatistics<TestType>> MergedStatistics(
+ const TypedStatistics<TestType>& stats1, const
TypedStatistics<TestType>& stats2) {
+ auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
+ chunk_statistics->Merge(stats1);
+ chunk_statistics->Merge(stats2);
+ return chunk_statistics;
+ }
+
+ void VerifyMergedStatistics(
+ const TypedStatistics<TestType>& stats1, const
TypedStatistics<TestType>& stats2,
+ const std::function<void(TypedStatistics<TestType>*)>& test_fn) {
+ ASSERT_NO_FATAL_FAILURE(test_fn(MergedStatistics(stats1, stats2).get()));
+ ASSERT_NO_FATAL_FAILURE(test_fn(MergedStatistics(stats2, stats1).get()));
+ }
+
+ // Distinct count should set to false when Merge is called.
+ void TestMergeDistinctCount() {
+ // Create a statistics object with distinct count.
+ std::shared_ptr<TypedStatistics<TestType>> statistics1;
+ {
+ EncodedStatistics encoded_statistics1;
+ statistics1 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+ Statistics::Make(this->schema_.Column(0), &encoded_statistics1,
+ /*num_values=*/1000));
+ EXPECT_FALSE(statistics1->HasDistinctCount());
+ }
+
+ // Create a statistics object with distinct count.
+ std::shared_ptr<TypedStatistics<TestType>> statistics2;
+ {
+ EncodedStatistics encoded_statistics2;
+ encoded_statistics2.has_distinct_count = true;
+ encoded_statistics2.distinct_count = 500;
+ statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+ Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
+ /*num_values=*/1000));
+ EXPECT_TRUE(statistics2->HasDistinctCount());
+ }
+
+ VerifyMergedStatistics(*statistics1, *statistics2,
+ [](TypedStatistics<TestType>* merged_statistics) {
+
EXPECT_FALSE(merged_statistics->HasDistinctCount());
+
EXPECT_FALSE(merged_statistics->Encode().has_distinct_count);
+ });
+ }
+
+ // If all values in a page are null or nan, its stats should not set min-max.
+ // Merging its stats with another page having good min-max stats should not
+ // drop the valid min-max from the latter page.
+ void TestMergeMinMax() {
+ this->GenerateData(1000);
+ // Create a statistics object without min-max.
+ std::shared_ptr<TypedStatistics<TestType>> statistics1;
+ {
+ statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
+ statistics1->Update(this->values_ptr_, /*num_values=*/0,
+ /*null_count=*/this->values_.size());
+ auto encoded_stats1 = statistics1->Encode();
+ EXPECT_FALSE(statistics1->HasMinMax());
+ EXPECT_FALSE(encoded_stats1.has_min);
+ EXPECT_FALSE(encoded_stats1.has_max);
+ }
+ // Create a statistics object with min-max.
+ std::shared_ptr<TypedStatistics<TestType>> statistics2;
+ {
+ statistics2 = MakeStatistics<TestType>(this->schema_.Column(0));
+ statistics2->Update(this->values_ptr_, this->values_.size(), 0);
+ auto encoded_stats2 = statistics2->Encode();
+ EXPECT_TRUE(statistics2->HasMinMax());
+ EXPECT_TRUE(encoded_stats2.has_min);
+ EXPECT_TRUE(encoded_stats2.has_max);
+ }
+ VerifyMergedStatistics(*statistics1, *statistics2,
+ [](TypedStatistics<TestType>* merged_statistics) {
+ EXPECT_TRUE(merged_statistics->HasMinMax());
+ EXPECT_TRUE(merged_statistics->Encode().has_min);
+ EXPECT_TRUE(merged_statistics->Encode().has_max);
+ });
+ }
+
+ // Default statistics should have null_count even if no nulls is written.
+ // However, if statistics is created from thrift message, it might not
+ // have null_count. Merging statistics from such page will result in an
+ // invalid null_count as well.
+ void TestMergeNullCount() {
+ this->GenerateData(/*num_values=*/1000);
+
+ // Page should have null-count even if no nulls
+ std::shared_ptr<TypedStatistics<TestType>> statistics1;
+ {
+ statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
+ statistics1->Update(this->values_ptr_,
/*num_values=*/this->values_.size(),
+ /*null_count=*/0);
+ auto encoded_stats1 = statistics1->Encode();
+ EXPECT_TRUE(statistics1->HasNullCount());
+ EXPECT_EQ(0, statistics1->null_count());
+ EXPECT_TRUE(statistics1->Encode().has_null_count);
+ }
+ // Merge with null-count should also have null count
+ VerifyMergedStatistics(*statistics1, *statistics1,
+ [](TypedStatistics<TestType>* merged_statistics) {
+ EXPECT_TRUE(merged_statistics->HasNullCount());
+ EXPECT_EQ(0, merged_statistics->null_count());
+ auto encoded = merged_statistics->Encode();
+ EXPECT_TRUE(encoded.has_null_count);
+ EXPECT_EQ(0, encoded.null_count);
+ });
+
+ // When loaded from thrift, might not have null count.
+ std::shared_ptr<TypedStatistics<TestType>> statistics2;
+ {
+ EncodedStatistics encoded_statistics2;
+ encoded_statistics2.has_null_count = false;
+ statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+ Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
+ /*num_values=*/1000));
+ EXPECT_FALSE(statistics2->Encode().has_null_count);
+ EXPECT_FALSE(statistics2->HasNullCount());
+ }
+
+ // Merge without null-count should not have null count
+ VerifyMergedStatistics(*statistics1, *statistics2,
+ [](TypedStatistics<TestType>* merged_statistics) {
+ EXPECT_FALSE(merged_statistics->HasNullCount());
+
EXPECT_FALSE(merged_statistics->Encode().has_null_count);
+ });
+ }
+
+ // statistics.all_null_value is used to build the page index.
+ // If statistics doesn't have null count, all_null_value should be false.
+ void TestMissingNullCount() {
+ EncodedStatistics encoded_statistics;
+ encoded_statistics.has_null_count = false;
+ auto statistics = Statistics::Make(this->schema_.Column(0),
&encoded_statistics,
+ /*num_values=*/1000);
+ auto typed_stats =
std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics);
+ EXPECT_FALSE(typed_stats->HasNullCount());
+ auto encoded = typed_stats->Encode();
+ EXPECT_FALSE(encoded.all_null_value);
+ EXPECT_FALSE(encoded.has_null_count);
+ EXPECT_FALSE(encoded.has_distinct_count);
+ EXPECT_FALSE(encoded.has_min);
+ EXPECT_FALSE(encoded.has_max);
+ }
+};
+
+TYPED_TEST_SUITE(TestStatisticsHasFlag, Types);
+
+TYPED_TEST(TestStatisticsHasFlag, MergeDistinctCount) {
+ ASSERT_NO_FATAL_FAILURE(this->TestMergeDistinctCount());
+}
+
+TYPED_TEST(TestStatisticsHasFlag, MergeNullCount) {
+ ASSERT_NO_FATAL_FAILURE(this->TestMergeNullCount());
+}
+
+TYPED_TEST(TestStatisticsHasFlag, MergeMinMax) {
+ ASSERT_NO_FATAL_FAILURE(this->TestMergeMinMax());
+}
+
+TYPED_TEST(TestStatisticsHasFlag, MissingNullCount) {
+ ASSERT_NO_FATAL_FAILURE(this->TestMissingNullCount());
+}
+
// Helper for basic statistics tests below
void AssertStatsSet(const ApplicationVersion& version,
std::shared_ptr<parquet::WriterProperties> props,
@@ -1212,5 +1387,21 @@ TEST(TestStatisticsSortOrderMinMax, Unsigned) {
ASSERT_EQ(0x0b, stats->EncodeMax()[0]);
}
+TEST(TestEncodedStatistics, CopySafe) {
+ EncodedStatistics encoded_statistics;
+ encoded_statistics.set_max("abc");
+ encoded_statistics.has_max = true;
+
+ encoded_statistics.set_min("abc");
+ encoded_statistics.has_min = true;
+
+ EncodedStatistics copy_statistics = encoded_statistics;
+ copy_statistics.set_max("abcd");
+ copy_statistics.set_min("a");
+
+ EXPECT_EQ("abc", encoded_statistics.min());
+ EXPECT_EQ("abc", encoded_statistics.max());
+}
+
} // namespace test
} // namespace parquet