pitrou commented on code in PR #46992: URL: https://github.com/apache/arrow/pull/46992#discussion_r2285429962
########## cpp/src/parquet/statistics.h: ########## @@ -373,10 +389,13 @@ std::shared_ptr<TypedStatistics<DType>> MakeStatistics( const ColumnDescriptor* descr, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, int64_t null_count, int64_t distinct_count, bool has_min_max, bool has_null_count, - bool has_distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { - return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make( - descr, encoded_min, encoded_max, num_values, null_count, distinct_count, - has_min_max, has_null_count, has_distinct_count, pool)); + bool has_distinct_count, std::optional<bool> is_min_value_exact, Review Comment: Similar comment about breaking an existing API. ########## cpp/src/parquet/statistics_test.cc: ########## @@ -320,9 +320,9 @@ class TestStatistics : public PrimitiveTypedTest<TestType> { std::string encoded_min = statistics1->EncodeMin(); std::string encoded_max = statistics1->EncodeMax(); - auto statistics2 = - MakeStatistics<TestType>(this->schema_.Column(0), encoded_min, encoded_max, - this->values_.size(), 0, 0, true, true, true); + auto statistics2 = MakeStatistics<TestType>(this->schema_.Column(0), encoded_min, + encoded_max, this->values_.size(), 0, 0, + true, true, true, true, true); Review Comment: This isn't readable at all, we want to add the parameter names for clarity: ```suggestion auto statistics2 = MakeStatistics<TestType>(this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), 0, 0, /*xxx=*/ true, /*etc.*/ true, true, true, true); ``` ########## cpp/src/parquet/statistics_test.cc: ########## @@ -1598,31 +1642,104 @@ TEST(TestStatisticsSortOrderMinMax, Unsigned) { ASSERT_EQ(12, stats->num_values()); ASSERT_EQ(0x00, stats->EncodeMin()[0]); ASSERT_EQ(0x0b, stats->EncodeMax()[0]); + std::shared_ptr<EncodedStatistics> enc_stats = column_chunk->encoded_statistics(); + ASSERT_FALSE(enc_stats->is_max_value_exact.has_value()); + ASSERT_FALSE(enc_stats->is_min_value_exact.has_value()); +} + +// Test statistics for binary column with truncated max and min values +TEST(TestStatisticsTruncatedMinMax, Unsigned) { Review Comment: Why is this test called "Unsigned"? ########## cpp/src/parquet/statistics_test.cc: ########## @@ -336,10 +336,14 @@ class TestStatistics : public PrimitiveTypedTest<TestType> { ASSERT_EQ(encoded_max, statistics2->EncodeMax()); ASSERT_EQ(statistics1->min(), statistics2->min()); ASSERT_EQ(statistics1->max(), statistics2->max()); + ASSERT_EQ(statistics1->is_min_value_exact(), statistics2->is_min_value_exact()); + ASSERT_EQ(statistics1->is_max_value_exact(), statistics2->is_max_value_exact()); Review Comment: Can we test the actual values here? (I presume `true`?) ########## cpp/src/parquet/statistics.h: ########## @@ -215,12 +220,15 @@ class PARQUET_EXPORT Statistics { /// \param[in] has_min_max whether the min/max statistics are set /// \param[in] has_null_count whether the null_count statistics are set /// \param[in] has_distinct_count whether the distinct_count statistics are set + /// \param[in] is_min_value_exact whether the min value is exact + /// \param[in] is_max_value_exact whether the max value is exact /// \param[in] pool a memory pool to use for any memory allocations, optional static std::shared_ptr<Statistics> Make( const ColumnDescriptor* descr, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, int64_t null_count, int64_t distinct_count, bool has_min_max, bool has_null_count, - bool has_distinct_count, + bool has_distinct_count, std::optional<bool> is_min_value_exact, + std::optional<bool> is_max_value_exact, Review Comment: This changes the function signature and will therefore break compatibility. Do we want to maintain a compatible version of this function alongside? ########## cpp/src/parquet/printer.cc: ########## @@ -337,6 +347,22 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected << R"("Max": ")" << FormatStatValue(descr->physical_type(), max) << "\", " << R"("Min": ")" << FormatStatValue(descr->physical_type(), min) << "\""; + if (stats->is_max_value_exact().has_value()) { + stream << ", " + << R"("IsMaxValueExact": ")" + << (stats->is_max_value_exact().value() ? "true" : "false") << "\""; + } else { + stream << ", " + << R"("IsMaxValueExact": "unknown")"; + } + if (stats->is_min_value_exact().has_value()) { + stream << ", " + << R"("IsMinValueExact": ")" + << (stats->is_min_value_exact().value() ? "true" : "false") << "\""; + } else { + stream << ", " + << R"("IsMinValueExact": "unknown")"; Review Comment: Is the JSON output tested somewhere? I can't see any corresponding change in this PR. ########## cpp/src/parquet/statistics.h: ########## @@ -259,6 +264,14 @@ class PARQUET_EXPORT Statistics { /// \brief Plain-encoded maximum value virtual std::string EncodeMax() const = 0; + /// \brief Return the minimum value exact flag if set. + /// It will be true if there was no truncation. + virtual std::optional<bool> is_min_value_exact() const = 0; Review Comment: I tend to agree with @raulcd on this one. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org