wgtmac commented on code in PR #46275: URL: https://github.com/apache/arrow/pull/46275#discussion_r2083852072
########## cpp/src/parquet/metadata.cc: ########## @@ -1552,8 +1554,8 @@ bool ApplicationVersion::HasCorrectStatistics(Type::type col_type, return true; } - // Unknown sort order has incorrect stats - if (SortOrder::UNKNOWN == sort_order) { + // Unknown sort order has incorrect stats if the min or the max are specified + if (SortOrder::UNKNOWN == sort_order && (statistics.has_min || statistics.has_max)) { Review Comment: Should we just check geometry & geography types here? ########## cpp/src/parquet/statistics.cc: ########## @@ -963,6 +963,89 @@ std::shared_ptr<Comparator> DoMakeComparator(Type::type physical_type, return nullptr; } +template <typename DType> +class UnsortedTypedStatisticsImpl : public TypedStatistics<DType> { + public: + using T = typename DType::c_type; + + explicit UnsortedTypedStatisticsImpl(const ColumnDescriptor* descr) : descr_(descr) {} + + UnsortedTypedStatisticsImpl(const ColumnDescriptor* descr, int64_t num_values, + int64_t null_count) + : descr_(descr), num_values_(num_values), null_count_(null_count) {} + + bool HasDistinctCount() const override { return false; }; + bool HasMinMax() const override { return false; } + bool HasNullCount() const override { return true; }; + + int64_t null_count() const override { return null_count_; } + + int64_t distinct_count() const override { return num_values_; } Review Comment: Should we throw or return -1? ########## cpp/src/parquet/statistics.cc: ########## @@ -963,6 +963,89 @@ std::shared_ptr<Comparator> DoMakeComparator(Type::type physical_type, return nullptr; } +template <typename DType> +class UnsortedTypedStatisticsImpl : public TypedStatistics<DType> { Review Comment: Yes, the Java impl did this because the spec advises that min/max values are undefined and should not be used in this case. If we go with this approach, perhaps we need to disable page index (at least the column index) to reduce file size. ########## cpp/src/parquet/metadata.cc: ########## @@ -307,8 +307,10 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { DCHECK(writer_version_ != nullptr); // If the column statistics don't exist or column sort order is unknown // we cannot use the column stats + bool is_geometry = + descr_->logical_type() != nullptr && descr_->logical_type()->is_geometry(); if (!column_metadata_->__isset.statistics || - descr_->sort_order() == SortOrder::UNKNOWN) { + (descr_->sort_order() == SortOrder::UNKNOWN && !is_geometry)) { Review Comment: According to the [spec](https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift#L1045), only `INT96` primitive type and `INTERVAL` logical type have `undefined` column order (except complex types like `map`, `list`, `variant`, and geo types). - INT96: use `BINARY_AS_SIGNED_INTEGER_COMPARATOR`: https://github.com/apache/parquet-java/blob/5f079b98e63c814535e8709ab5c6fb672c2aedc5/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java#L366 - INTERVAL: use `UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR` :https://github.com/apache/parquet-java/blob/5f079b98e63c814535e8709ab5c6fb672c2aedc5/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java#L406 However, I think parquet-java does not cleanly implement the column order semantics. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org