pitrou commented on code in PR #43726:
URL: https://github.com/apache/arrow/pull/43726#discussion_r1742034385
##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -366,9 +366,16 @@ std::optional<compute::Expression>
ParquetFileFragment::EvaluateStatisticsAsExpr
const parquet::Statistics& statistics) {
auto field_expr = compute::field_ref(field_ref);
+ bool may_has_null = !statistics.HasNullCount() || statistics.null_count() >
0;
+ bool must_has_null = statistics.HasNullCount() && statistics.null_count() >
0;
Review Comment:
```suggestion
bool may_have_null = !statistics.HasNullCount() || statistics.null_count()
> 0;
bool has_null = statistics.HasNullCount() && statistics.null_count() > 0;
```
##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -366,9 +366,16 @@ std::optional<compute::Expression>
ParquetFileFragment::EvaluateStatisticsAsExpr
const parquet::Statistics& statistics) {
auto field_expr = compute::field_ref(field_ref);
+ bool may_has_null = !statistics.HasNullCount() || statistics.null_count() >
0;
+ bool must_has_null = statistics.HasNullCount() && statistics.null_count() >
0;
// Optimize for corner case where all values are nulls
- if (statistics.num_values() == 0 && statistics.null_count() > 0) {
- return is_null(std::move(field_expr));
+ if (statistics.num_values() == 0) {
+ if (must_has_null) {
+ return is_null(std::move(field_expr));
+ }
+ // If there are no values and no nulls, it might be empty or contains
+ // only null.
+ return std::nullopt;
Review Comment:
This seems a bit weird (though it's certainly safe to return `std::nullopt`).
If `num_values() == 0`, there can be only nulls, so we can just return
`is_null(std::move(field_expr))` too?
##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -412,8 +418,7 @@ std::optional<compute::Expression>
ParquetFileFragment::EvaluateStatisticsAsExpr
} else {
in_range = compute::and_(std::move(lower_bound), std::move(upper_bound));
}
-
- if (statistics.null_count() != 0) {
+ if (may_has_null) {
return compute::or_(std::move(in_range), compute::is_null(field_expr));
Review Comment:
Nit
```suggestion
return compute::or_(std::move(in_range),
compute::is_null(std::move(field_expr)));
```
##########
cpp/src/arrow/dataset/file_parquet_test.cc:
##########
@@ -841,6 +841,53 @@ TEST(TestParquetStatistics, NullMax) {
EXPECT_EQ(stat_expression->ToString(), "(x >= 1)");
}
+TEST(TestParquetStatistics, NoNullCount) {
+ auto field = ::arrow::field("x", int32());
+ auto parquet_node_ptr = ::parquet::schema::Int32("x",
::parquet::Repetition::REQUIRED);
+ ::parquet::ColumnDescriptor descr(parquet_node_ptr,
/*max_definition_level=*/1,
+ /*max_repetition_level=*/0);
+
+ auto int32_to_parquet_stats = [](int32_t v) {
+ std::string value;
+ value.resize(sizeof(int32_t));
+ memcpy(value.data(), &v, sizeof(int32_t));
+ return value;
+ };
+ {
+ // Base case: when null_count is not set, the expression might contain null
+ ::parquet::EncodedStatistics encoded_stats;
+ encoded_stats.set_min(int32_to_parquet_stats(1));
+ encoded_stats.set_max(int32_to_parquet_stats(100));
+ encoded_stats.has_null_count = false;
+ encoded_stats.all_null_value = false;
+ encoded_stats.null_count = 0;
+ auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats,
/*num_values=*/10);
+
+ auto stat_expression =
+ ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats);
+ EXPECT_EQ(stat_expression->ToString(),
+ "(((x >= 1) and (x <= 100)) or is_null(x,
{nan_is_null=false}))");
+ }
+ {
+ // Special case: when num_value is 0, if has_null, it would return
+ // "is_null", otherwise it cannot gurantees anything
Review Comment:
I don't understand why it cannot guarantee anything. It _knows_ that there
are no non-null values, so `is_null` is true for all values (even if there are
no values at all).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]