lidavidm commented on code in PR #12891:
URL: https://github.com/apache/arrow/pull/12891#discussion_r852174446
##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -128,17 +128,30 @@ util::optional<compute::Expression>
ColumnChunkStatisticsAsExpression(
auto maybe_min = min->CastTo(field->type());
auto maybe_max = max->CastTo(field->type());
if (maybe_min.ok() && maybe_max.ok()) {
- auto col_min = maybe_min.MoveValueUnsafe();
- auto col_max = maybe_max.MoveValueUnsafe();
- if (col_min->Equals(col_max)) {
- return compute::equal(std::move(field_expr),
compute::literal(std::move(col_min)));
+ min = maybe_min.MoveValueUnsafe();
+ max = maybe_max.MoveValueUnsafe();
+
+ compute::Expression range;
+ if (min->Equals(max)) {
+ auto single_value = compute::equal(field_expr,
compute::literal(std::move(min)));
+
+ if (statistics->null_count() == 0) {
+ return compute::and_(single_value, compute::is_valid(field_expr));
+ }
+ return compute::or_(std::move(single_value),
is_null(std::move(field_expr)));
}
auto lower_bound =
- compute::greater_equal(field_expr,
compute::literal(std::move(col_min)));
- auto upper_bound =
- compute::less_equal(std::move(field_expr),
compute::literal(std::move(col_max)));
- return compute::and_(std::move(lower_bound), std::move(upper_bound));
+ compute::greater_equal(field_expr, compute::literal(std::move(min)));
+ auto upper_bound = compute::less_equal(field_expr,
compute::literal(std::move(max)));
+
+ if (statistics->null_count() != 0) {
+ lower_bound = compute::or_(std::move(lower_bound), is_null(field_expr));
+ upper_bound = compute::or_(std::move(upper_bound),
is_null(std::move(field_expr)));
+ return compute::and_(std::move(lower_bound), std::move(upper_bound));
+ }
+ return compute::and_(compute::and_(std::move(lower_bound),
std::move(upper_bound)),
+ compute::is_valid(field_expr));
}
Review Comment:
I believe this is handled at
https://github.com/apache/arrow/blob/fae66cba04aba6528ba7d6a8c225cff24c469ef2/cpp/src/arrow/dataset/file_parquet.cc#L118-L121
Confusingly enough, `num_values` does not include nulls. See this test, which
already covers this case:
https://github.com/apache/arrow/pull/12891/files#diff-d88654840d0432223c1617e8fd9289db0f4e6fff6b34e9f062861ef8eec724fcR256
That test writes each record batch to its own row group, so it would fail
if we didn't generate the proper guarantee for the all-null row group.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]