pitrou commented on code in PR #43726:
URL: https://github.com/apache/arrow/pull/43726#discussion_r1742034385


##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -366,9 +366,16 @@ std::optional<compute::Expression> 
ParquetFileFragment::EvaluateStatisticsAsExpr
     const parquet::Statistics& statistics) {
   auto field_expr = compute::field_ref(field_ref);
 
+  bool may_has_null = !statistics.HasNullCount() || statistics.null_count() > 
0;
+  bool must_has_null = statistics.HasNullCount() && statistics.null_count() > 
0;

Review Comment:
   ```suggestion
     bool may_have_null = !statistics.HasNullCount() || statistics.null_count() 
> 0;
     bool has_null = statistics.HasNullCount() && statistics.null_count() > 0;
   ```



##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -366,9 +366,16 @@ std::optional<compute::Expression> 
ParquetFileFragment::EvaluateStatisticsAsExpr
     const parquet::Statistics& statistics) {
   auto field_expr = compute::field_ref(field_ref);
 
+  bool may_has_null = !statistics.HasNullCount() || statistics.null_count() > 
0;
+  bool must_has_null = statistics.HasNullCount() && statistics.null_count() > 
0;
   // Optimize for corner case where all values are nulls
-  if (statistics.num_values() == 0 && statistics.null_count() > 0) {
-    return is_null(std::move(field_expr));
+  if (statistics.num_values() == 0) {
+    if (must_has_null) {
+      return is_null(std::move(field_expr));
+    }
+    // If there are no values and no nulls, it might be empty or contains
+    // only null.
+    return std::nullopt;

Review Comment:
   This seems a bit weird (though it's certainly safe to return `std::nullopt`).
   If `num_values() == 0`, there can be only nulls, so we can just return 
`is_null(std::move(field_expr))` too?
   



##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -412,8 +418,7 @@ std::optional<compute::Expression> 
ParquetFileFragment::EvaluateStatisticsAsExpr
     } else {
       in_range = compute::and_(std::move(lower_bound), std::move(upper_bound));
     }
-
-    if (statistics.null_count() != 0) {
+    if (may_has_null) {
       return compute::or_(std::move(in_range), compute::is_null(field_expr));

Review Comment:
   Nit
   ```suggestion
         return compute::or_(std::move(in_range), 
compute::is_null(std::move(field_expr)));
   ```



##########
cpp/src/arrow/dataset/file_parquet_test.cc:
##########
@@ -841,6 +841,53 @@ TEST(TestParquetStatistics, NullMax) {
   EXPECT_EQ(stat_expression->ToString(), "(x >= 1)");
 }
 
+TEST(TestParquetStatistics, NoNullCount) {
+  auto field = ::arrow::field("x", int32());
+  auto parquet_node_ptr = ::parquet::schema::Int32("x", 
::parquet::Repetition::REQUIRED);
+  ::parquet::ColumnDescriptor descr(parquet_node_ptr, 
/*max_definition_level=*/1,
+                                    /*max_repetition_level=*/0);
+
+  auto int32_to_parquet_stats = [](int32_t v) {
+    std::string value;
+    value.resize(sizeof(int32_t));
+    memcpy(value.data(), &v, sizeof(int32_t));
+    return value;
+  };
+  {
+    // Base case: when null_count is not set, the expression might contain null
+    ::parquet::EncodedStatistics encoded_stats;
+    encoded_stats.set_min(int32_to_parquet_stats(1));
+    encoded_stats.set_max(int32_to_parquet_stats(100));
+    encoded_stats.has_null_count = false;
+    encoded_stats.all_null_value = false;
+    encoded_stats.null_count = 0;
+    auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, 
/*num_values=*/10);
+
+    auto stat_expression =
+        ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats);
+    EXPECT_EQ(stat_expression->ToString(),
+              "(((x >= 1) and (x <= 100)) or is_null(x, 
{nan_is_null=false}))");
+  }
+  {
+    // Special case: when num_value is 0, if has_null, it would return
+    // "is_null", otherwise it cannot gurantees anything

Review Comment:
   I don't understand why it cannot guarantee anything. It _knows_ that there 
are no non-null values, so `is_null` is true for all values (even if there are 
no values at all).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to