pitrou commented on code in PR #50157:
URL: https://github.com/apache/arrow/pull/50157#discussion_r3394337378
##########
cpp/src/parquet/metadata.cc:
##########
@@ -90,36 +90,111 @@ std::string ParquetVersionToString(ParquetVersion::type
ver) {
return "UNKNOWN";
}
+namespace {
+
+enum class StatsMinMaxMode {
+ // Ignore min/max fields because their ordering is unknown or unsupported.
+ kDiscard,
+ // Use legacy min/max fields for files without column orders.
+ kLegacy,
+ // Use min_value/max_value fields with the column's well-defined order.
+ kNormal,
+};
+
+inline StatsMinMaxMode GetStatsMinMaxMode(const ColumnDescriptor& descr) {
+ switch (descr.column_order().get_order()) {
+ case ColumnOrder::TYPE_DEFINED_ORDER:
+ return descr.sort_order() != SortOrder::UNKNOWN ?
StatsMinMaxMode::kNormal
+ :
StatsMinMaxMode::kDiscard;
+ case ColumnOrder::UNDEFINED:
+ return descr.sort_order() != SortOrder::UNKNOWN ?
StatsMinMaxMode::kLegacy
+ :
StatsMinMaxMode::kDiscard;
+ case ColumnOrder::UNKNOWN:
+ return StatsMinMaxMode::kDiscard;
+ }
+ return StatsMinMaxMode::kDiscard;
+}
+
+} // namespace
+
+static EncodedStatistics EncodedStatisticsFromThrift(const format::Statistics&
statistics,
Review Comment:
Just put this in the anonymous namespace above instead of marking it
`static`.
##########
cpp/src/parquet/page_index.cc:
##########
@@ -973,6 +979,9 @@ std::unique_ptr<ColumnIndex> ColumnIndex::Make(const
ColumnDescriptor& descr,
// Guard against UB when moving column_index
throw ParquetException("Invalid ColumnIndex boundary_order");
}
+ if (!CanTrustPageIndexMinMax(descr)) {
+ return nullptr;
Review Comment:
It doesn't seem right for a factory function to be allowed to return NULL.
It's also an API change.
##########
cpp/src/parquet/page_index.cc:
##########
@@ -973,6 +979,9 @@ std::unique_ptr<ColumnIndex> ColumnIndex::Make(const
ColumnDescriptor& descr,
// Guard against UB when moving column_index
throw ParquetException("Invalid ColumnIndex boundary_order");
}
+ if (!CanTrustPageIndexMinMax(descr)) {
+ return nullptr;
Review Comment:
But I think the bottom line is that the `ColumnIndex` object only gives
access to raw encoded statistics anyway. It's up to the caller to decide if
they know how to handle them. So why check this here?
Instead, perhaps `ColumnDescriptor` can expose a method `can_use_statistics`
or something similar.
##########
cpp/src/parquet/page_index.cc:
##########
@@ -973,6 +979,9 @@ std::unique_ptr<ColumnIndex> ColumnIndex::Make(const
ColumnDescriptor& descr,
// Guard against UB when moving column_index
throw ParquetException("Invalid ColumnIndex boundary_order");
}
+ if (!CanTrustPageIndexMinMax(descr)) {
+ return nullptr;
Review Comment:
Note we're raising an exception if we encounter an unknown boundary order
above.
##########
cpp/src/parquet/metadata.cc:
##########
@@ -90,36 +90,111 @@ std::string ParquetVersionToString(ParquetVersion::type
ver) {
return "UNKNOWN";
}
+namespace {
+
+enum class StatsMinMaxMode {
+ // Ignore min/max fields because their ordering is unknown or unsupported.
+ kDiscard,
+ // Use legacy min/max fields for files without column orders.
+ kLegacy,
+ // Use min_value/max_value fields with the column's well-defined order.
+ kNormal,
+};
+
+StatsMinMaxMode GetStatsMinMaxMode(const ColumnDescriptor& descr) {
+ switch (descr.column_order().get_order()) {
+ case ColumnOrder::TYPE_DEFINED_ORDER:
+ return descr.sort_order() != SortOrder::UNKNOWN ?
StatsMinMaxMode::kNormal
+ :
StatsMinMaxMode::kDiscard;
+ case ColumnOrder::UNDEFINED:
+ return descr.sort_order() != SortOrder::UNKNOWN ?
StatsMinMaxMode::kLegacy
+ :
StatsMinMaxMode::kDiscard;
+ case ColumnOrder::UNKNOWN:
+ return StatsMinMaxMode::kDiscard;
+ }
+ return StatsMinMaxMode::kDiscard;
+}
+
+} // namespace
+
+static EncodedStatistics EncodedStatisticsFromThrift(const format::Statistics&
statistics,
+ StatsMinMaxMode min_max) {
Review Comment:
Also, can we reconcile this with [the existing function in
`thrift_internal.h`](https://github.com/apache/arrow/blob/ed536d18f7afc2c2964211c3625ffd29bf3e8ded/cpp/src/parquet/thrift_internal.h#L255-L291)?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]