This is an automated email from the ASF dual-hosted git repository.
westonpace pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 518fc51e31 GH-28074: [C++][Dataset] Handle NaNs correctly in Parquet
predicate push-down (#15125)
518fc51e31 is described below
commit 518fc51e313c4ed886cb947d71fa3c8cd5a86187
Author: Sanjiban Sengupta <[email protected]>
AuthorDate: Fri Feb 10 00:06:09 2023 +0530
GH-28074: [C++][Dataset] Handle NaNs correctly in Parquet predicate
push-down (#15125)
This PR fixes the issue of handling NaNs in the Parquet predicate push-down.
While computing the valid bounds for a column, if the max or min of the
column is null, the range should ignore that.
* Closes: #28074
Authored-by: Sanjiban Sengupta <[email protected]>
Signed-off-by: Weston Pace <[email protected]>
---
cpp/src/arrow/dataset/file_parquet.cc | 113 +++++++++++++++++++----------
cpp/src/arrow/dataset/file_parquet.h | 3 +
cpp/src/arrow/dataset/file_parquet_test.cc | 16 ++++
cpp/submodules/parquet-testing | 2 +-
testing | 2 +-
5 files changed, 95 insertions(+), 41 deletions(-)
diff --git a/cpp/src/arrow/dataset/file_parquet.cc
b/cpp/src/arrow/dataset/file_parquet.cc
index 698e33ca7a..2ea6f76784 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -98,6 +98,19 @@ Result<std::shared_ptr<SchemaManifest>> GetSchemaManifest(
return manifest;
}
+bool IsNan(const Scalar& value) {
+ if (value.is_valid) {
+ if (value.type->id() == Type::FLOAT) {
+ const FloatScalar& float_scalar = checked_cast<const
FloatScalar&>(value);
+ return std::isnan(float_scalar.value);
+ } else if (value.type->id() == Type::DOUBLE) {
+ const DoubleScalar& double_scalar = checked_cast<const
DoubleScalar&>(value);
+ return std::isnan(double_scalar.value);
+ }
+ }
+ return false;
+}
+
std::optional<compute::Expression> ColumnChunkStatisticsAsExpression(
const SchemaField& schema_field, const parquet::RowGroupMetaData&
metadata) {
// For the remaining of this function, failure to extract/parse statistics
@@ -112,50 +125,13 @@ std::optional<compute::Expression>
ColumnChunkStatisticsAsExpression(
auto column_metadata = metadata.ColumnChunk(schema_field.column_index);
auto statistics = column_metadata->statistics();
- if (statistics == nullptr) {
- return std::nullopt;
- }
-
const auto& field = schema_field.field;
- auto field_expr = compute::field_ref(field->name());
- // Optimize for corner case where all values are nulls
- if (statistics->num_values() == 0 && statistics->null_count() > 0) {
- return is_null(std::move(field_expr));
- }
-
- std::shared_ptr<Scalar> min, max;
- if (!StatisticsAsScalars(*statistics, &min, &max).ok()) {
+ if (statistics == nullptr) {
return std::nullopt;
}
- auto maybe_min = min->CastTo(field->type());
- auto maybe_max = max->CastTo(field->type());
- if (maybe_min.ok() && maybe_max.ok()) {
- min = maybe_min.MoveValueUnsafe();
- max = maybe_max.MoveValueUnsafe();
-
- if (min->Equals(max)) {
- auto single_value = compute::equal(field_expr,
compute::literal(std::move(min)));
-
- if (statistics->null_count() == 0) {
- return single_value;
- }
- return compute::or_(std::move(single_value),
is_null(std::move(field_expr)));
- }
-
- auto lower_bound =
- compute::greater_equal(field_expr, compute::literal(std::move(min)));
- auto upper_bound = compute::less_equal(field_expr,
compute::literal(std::move(max)));
-
- auto in_range = compute::and_(std::move(lower_bound),
std::move(upper_bound));
- if (statistics->null_count() != 0) {
- return compute::or_(std::move(in_range), compute::is_null(field_expr));
- }
- return in_range;
- }
-
- return std::nullopt;
+ return ParquetFileFragment::EvaluateStatisticsAsExpression(*field,
*statistics);
}
void AddColumnIndices(const SchemaField& schema_field,
@@ -306,6 +282,65 @@ Result<bool> IsSupportedParquetFile(const
ParquetFileFormat& format,
} // namespace
+std::optional<compute::Expression>
ParquetFileFragment::EvaluateStatisticsAsExpression(
+ const Field& field, const parquet::Statistics& statistics) {
+ auto field_expr = compute::field_ref(field.name());
+
+ // Optimize for corner case where all values are nulls
+ if (statistics.num_values() == 0 && statistics.null_count() > 0) {
+ return is_null(std::move(field_expr));
+ }
+
+ std::shared_ptr<Scalar> min, max;
+ if (!StatisticsAsScalars(statistics, &min, &max).ok()) {
+ return std::nullopt;
+ }
+
+ auto maybe_min = min->CastTo(field.type());
+ auto maybe_max = max->CastTo(field.type());
+
+ if (maybe_min.ok() && maybe_max.ok()) {
+ min = maybe_min.MoveValueUnsafe();
+ max = maybe_max.MoveValueUnsafe();
+
+ if (min->Equals(max)) {
+ auto single_value = compute::equal(field_expr,
compute::literal(std::move(min)));
+
+ if (statistics.null_count() == 0) {
+ return single_value;
+ }
+ return compute::or_(std::move(single_value),
is_null(std::move(field_expr)));
+ }
+
+ auto lower_bound = compute::greater_equal(field_expr,
compute::literal(min));
+ auto upper_bound = compute::less_equal(field_expr, compute::literal(max));
+ compute::Expression in_range;
+
+ // Since the minimum & maximum values are NaN, useful statistics
+ // cannot be extracted for checking the presence of a value within
+ // range
+ if (IsNan(*min) && IsNan(*max)) {
+ return std::nullopt;
+ }
+
+ // If either minimum or maximum is NaN, it should be ignored for the
+ // range computation
+ if (IsNan(*min)) {
+ in_range = std::move(upper_bound);
+ } else if (IsNan(*max)) {
+ in_range = std::move(lower_bound);
+ } else {
+ in_range = compute::and_(std::move(lower_bound), std::move(upper_bound));
+ }
+
+ if (statistics.null_count() != 0) {
+ return compute::or_(std::move(in_range), compute::is_null(field_expr));
+ }
+ return in_range;
+ }
+ return std::nullopt;
+}
+
ParquetFileFormat::ParquetFileFormat()
: FileFormat(std::make_shared<ParquetFragmentScanOptions>()) {}
diff --git a/cpp/src/arrow/dataset/file_parquet.h
b/cpp/src/arrow/dataset/file_parquet.h
index ac16123434..f33190bd93 100644
--- a/cpp/src/arrow/dataset/file_parquet.h
+++ b/cpp/src/arrow/dataset/file_parquet.h
@@ -171,6 +171,9 @@ class ARROW_DS_EXPORT ParquetFileFragment : public
FileFragment {
Result<std::shared_ptr<Fragment>> Subset(compute::Expression predicate);
Result<std::shared_ptr<Fragment>> Subset(std::vector<int> row_group_ids);
+ static std::optional<compute::Expression> EvaluateStatisticsAsExpression(
+ const Field& field, const parquet::Statistics& statistics);
+
private:
ParquetFileFragment(FileSource source, std::shared_ptr<FileFormat> format,
compute::Expression partition_expression,
diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc
b/cpp/src/arrow/dataset/file_parquet_test.cc
index 085508b20d..570e028695 100644
--- a/cpp/src/arrow/dataset/file_parquet_test.cc
+++ b/cpp/src/arrow/dataset/file_parquet_test.cc
@@ -33,10 +33,14 @@
#include "arrow/testing/util.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"
+#include "arrow/util/io_util.h"
#include "arrow/util/range.h"
#include "parquet/arrow/writer.h"
+#include "parquet/file_reader.h"
#include "parquet/metadata.h"
+#include "parquet/statistics.h"
+#include "parquet/types.h"
namespace arrow {
@@ -678,5 +682,17 @@ INSTANTIATE_TEST_SUITE_P(TestScan,
TestParquetFileFormatScan,
::testing::ValuesIn(TestFormatParams::Values()),
TestFormatParams::ToTestNameString);
+TEST(TestParquetStatistics, NullMax) {
+ auto field = ::arrow::field("x", float32());
+ ASSERT_OK_AND_ASSIGN(std::string dir_string,
+ arrow::internal::GetEnvVar("PARQUET_TEST_DATA"));
+ auto reader =
+ parquet::ParquetFileReader::OpenFile(dir_string +
"/nan_in_stats.parquet");
+ auto statistics =
reader->RowGroup(0)->metadata()->ColumnChunk(0)->statistics();
+ auto stat_expression =
+ ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *statistics);
+ EXPECT_EQ(stat_expression->ToString(), "(x >= 1)");
+}
+
} // namespace dataset
} // namespace arrow
diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing
index e2d244ab9a..33b4e23376 160000
--- a/cpp/submodules/parquet-testing
+++ b/cpp/submodules/parquet-testing
@@ -1 +1 @@
-Subproject commit e2d244ab9a84d382e3a50f55db41f362e450428b
+Subproject commit 33b4e23376c28e489c6a08b9207829b29e4bffb8
diff --git a/testing b/testing
index ecab1162cb..d2c73bf782 160000
--- a/testing
+++ b/testing
@@ -1 +1 @@
-Subproject commit ecab1162cbec872e17d949ecc86181670aee045c
+Subproject commit d2c73bf78246331d8e58b6f11aa8aa199cbb5929