sanjibansg commented on code in PR #15125:
URL: https://github.com/apache/arrow/pull/15125#discussion_r1101756899
##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -306,6 +282,65 @@ Result<bool> IsSupportedParquetFile(const
ParquetFileFormat& format,
} // namespace
+std::optional<compute::Expression>
ParquetFileFragment::EvaluateStatisticsAsExpression(
+ const Field& field, const parquet::Statistics& statistics) {
+ auto field_expr = compute::field_ref(field.name());
+
+ // Optimize for corner case where all values are nulls
+ if (statistics.num_values() == 0 && statistics.null_count() > 0) {
+ return is_null(std::move(field_expr));
+ }
+
+ std::shared_ptr<Scalar> min, max;
+ if (!StatisticsAsScalars(statistics, &min, &max).ok()) {
+ return std::nullopt;
+ }
+
+ auto maybe_min = min->CastTo(field.type());
+ auto maybe_max = max->CastTo(field.type());
+
+ if (maybe_min.ok() && maybe_max.ok()) {
+ min = maybe_min.MoveValueUnsafe();
+ max = maybe_max.MoveValueUnsafe();
+
+ // Since the minimum & maximum values are NaN, useful statistics
+ // cannot be extracted for checking the presence of a value within
+ // range
+ if (IsNan(*min) && IsNan(*max)) {
+ return std::nullopt;
+ }
+
+ if (min->Equals(max)) {
+ auto single_value = compute::equal(field_expr,
compute::literal(std::move(min)));
+
+ if (statistics.null_count() == 0) {
+ return single_value;
+ }
+ return compute::or_(std::move(single_value),
is_null(std::move(field_expr)));
+ }
+
+ auto lower_bound = compute::greater_equal(field_expr,
compute::literal(min));
+ auto upper_bound = compute::less_equal(field_expr, compute::literal(max));
+ compute::Expression in_range;
+
+ // If either minimum or maximum is NaN, it should be ignored for the
+ // range computation
+ if (IsNan(*min)) {
Review Comment:
Made the change, thanks!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]