This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git
The following commit(s) were added to refs/heads/main by this push:
new 09f26b66 feat: add strict metrics evaluator (#383)
09f26b66 is described below
commit 09f26b664e7c04628f8e6fdd272eedcc99ae107a
Author: dongxiao <[email protected]>
AuthorDate: Fri Dec 5 11:39:45 2025 +0800
feat: add strict metrics evaluator (#383)
---
src/iceberg/CMakeLists.txt | 1 +
src/iceberg/catalog/rest/meson.build | 1 -
src/iceberg/expression/meson.build | 4 +
src/iceberg/expression/strict_metrics_evaluator.cc | 506 ++++++++++++
src/iceberg/expression/strict_metrics_evaluator.h | 79 ++
src/iceberg/meson.build | 1 +
src/iceberg/test/CMakeLists.txt | 3 +-
src/iceberg/test/meson.build | 1 +
src/iceberg/test/strict_metrics_evaluator_test.cc | 849 +++++++++++++++++++++
9 files changed, 1443 insertions(+), 2 deletions(-)
diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
index 369666b7..275d71fc 100644
--- a/src/iceberg/CMakeLists.txt
+++ b/src/iceberg/CMakeLists.txt
@@ -29,6 +29,7 @@ set(ICEBERG_SOURCES
expression/literal.cc
expression/predicate.cc
expression/rewrite_not.cc
+ expression/strict_metrics_evaluator.cc
expression/term.cc
file_reader.cc
file_writer.cc
diff --git a/src/iceberg/catalog/rest/meson.build
b/src/iceberg/catalog/rest/meson.build
index 89a68850..8378b2a8 100644
--- a/src/iceberg/catalog/rest/meson.build
+++ b/src/iceberg/catalog/rest/meson.build
@@ -61,7 +61,6 @@ install_headers(
'error_handlers.h',
'http_client.h',
'iceberg_rest_export.h',
- 'json_internal.h',
'resource_paths.h',
'rest_catalog.h',
'rest_util.h',
diff --git a/src/iceberg/expression/meson.build
b/src/iceberg/expression/meson.build
index 83005908..8e312791 100644
--- a/src/iceberg/expression/meson.build
+++ b/src/iceberg/expression/meson.build
@@ -17,13 +17,17 @@
install_headers(
[
+ 'aggregate.h',
'binder.h',
+ 'evaluator.h',
'expression.h',
'expression_visitor.h',
'expressions.h',
+ 'inclusive_metrics_evaluator.h',
'literal.h',
'predicate.h',
'rewrite_not.h',
+ 'strict_metrics_evaluator.h',
'term.h',
],
subdir: 'iceberg/expression',
diff --git a/src/iceberg/expression/strict_metrics_evaluator.cc
b/src/iceberg/expression/strict_metrics_evaluator.cc
new file mode 100644
index 00000000..e2fe34f1
--- /dev/null
+++ b/src/iceberg/expression/strict_metrics_evaluator.cc
@@ -0,0 +1,506 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/expression/strict_metrics_evaluator.h"
+
+#include "iceberg/expression/binder.h"
+#include "iceberg/expression/expression_visitor.h"
+#include "iceberg/expression/rewrite_not.h"
+#include "iceberg/expression/term.h"
+#include "iceberg/manifest/manifest_entry.h"
+#include "iceberg/schema.h"
+#include "iceberg/type.h"
+#include "iceberg/util/macros.h"
+
+namespace iceberg {
+
+namespace {
+constexpr bool kRowsMustMatch = true;
+constexpr bool kRowsMightNotMatch = false;
+} // namespace
+
+// If the term in any expression is not a direct reference, assume that rows
may not
+// match. This happens when transforms or other expressions are passed to this
evaluator.
+// For example, bucket16(x) = 0 can't be determined because this visitor
operates on data
+// metrics and not partition values. It may be possible to un-transform
expressions for
+// order preserving transforms in the future, but this is not currently
supported.
+#define RETURN_IF_NOT_REFERENCE(expr) \
+ if (auto ref = dynamic_cast<BoundReference*>(expr.get()); ref == nullptr) { \
+ return kRowsMightNotMatch; \
+ }
+
+class StrictMetricsVisitor : public BoundVisitor<bool> {
+ public:
+ explicit StrictMetricsVisitor(const DataFile& data_file, const Schema&
schema)
+ : data_file_(data_file), schema_(schema) {}
+
+ Result<bool> AlwaysTrue() override { return kRowsMustMatch; }
+
+ Result<bool> AlwaysFalse() override { return kRowsMightNotMatch; }
+
+ Result<bool> Not(bool child_result) override { return !child_result; }
+
+ Result<bool> And(bool left_result, bool right_result) override {
+ return left_result && right_result;
+ }
+
+ Result<bool> Or(bool left_result, bool right_result) override {
+ return left_result || right_result;
+ }
+
+ Result<bool> IsNull(const std::shared_ptr<Bound>& expr) override {
+ RETURN_IF_NOT_REFERENCE(expr);
+
+ // no need to check whether the field is required because binding
evaluates that case
+ // if the column has any non-null values, the expression does not match
+ int32_t id = expr->reference()->field().field_id();
+
+ ICEBERG_ASSIGN_OR_RAISE(auto is_nested, IsNestedColumn(id));
+ if (is_nested) {
+ return kRowsMightNotMatch;
+ }
+
+ if (ContainsNullsOnly(id)) {
+ return kRowsMustMatch;
+ }
+ return kRowsMightNotMatch;
+ }
+
+ Result<bool> NotNull(const std::shared_ptr<Bound>& expr) override {
+ RETURN_IF_NOT_REFERENCE(expr);
+
+ // no need to check whether the field is required because binding
evaluates that case
+ // if the column has any null values, the expression does not match
+ int32_t id = expr->reference()->field().field_id();
+
+ ICEBERG_ASSIGN_OR_RAISE(auto is_nested, IsNestedColumn(id));
+ if (is_nested) {
+ return kRowsMightNotMatch;
+ }
+
+ auto it = data_file_.null_value_counts.find(id);
+ if (it != data_file_.null_value_counts.cend() && it->second == 0) {
+ return kRowsMustMatch;
+ }
+
+ return kRowsMightNotMatch;
+ }
+
+ Result<bool> IsNaN(const std::shared_ptr<Bound>& expr) override {
+ RETURN_IF_NOT_REFERENCE(expr);
+
+ int32_t id = expr->reference()->field().field_id();
+
+ if (ContainsNaNsOnly(id)) {
+ return kRowsMustMatch;
+ }
+
+ return kRowsMightNotMatch;
+ }
+
+ Result<bool> NotNaN(const std::shared_ptr<Bound>& expr) override {
+ RETURN_IF_NOT_REFERENCE(expr);
+
+ int32_t id = expr->reference()->field().field_id();
+
+ auto it = data_file_.nan_value_counts.find(id);
+ if (it != data_file_.nan_value_counts.cend() && it->second == 0) {
+ return kRowsMustMatch;
+ }
+
+ if (ContainsNullsOnly(id)) {
+ return kRowsMustMatch;
+ }
+
+ return kRowsMightNotMatch;
+ }
+
+ Result<bool> Lt(const std::shared_ptr<Bound>& expr, const Literal& lit)
override {
+ RETURN_IF_NOT_REFERENCE(expr);
+
+ // Rows must match when: <----------Min----Max---X------->
+ int32_t id = expr->reference()->field().field_id();
+
+ ICEBERG_ASSIGN_OR_RAISE(auto is_nested, IsNestedColumn(id));
+ if (is_nested) {
+ return kRowsMightNotMatch;
+ }
+
+ if (CanContainNulls(id) || CanContainNaNs(id)) {
+ return kRowsMightNotMatch;
+ }
+
+ auto it = data_file_.upper_bounds.find(id);
+ if (it != data_file_.upper_bounds.cend()) {
+ ICEBERG_ASSIGN_OR_RAISE(auto upper, ParseBound(expr, it->second));
+ if (upper < lit) {
+ return kRowsMustMatch;
+ }
+ }
+
+ return kRowsMightNotMatch;
+ }
+
+ Result<bool> LtEq(const std::shared_ptr<Bound>& expr, const Literal& lit)
override {
+ RETURN_IF_NOT_REFERENCE(expr);
+
+ // Rows must match when: <----------Min----Max---X------->
+ int32_t id = expr->reference()->field().field_id();
+
+ ICEBERG_ASSIGN_OR_RAISE(auto is_nested, IsNestedColumn(id));
+ if (is_nested) {
+ return kRowsMightNotMatch;
+ }
+
+ if (CanContainNulls(id) || CanContainNaNs(id)) {
+ return kRowsMightNotMatch;
+ }
+
+ auto it = data_file_.upper_bounds.find(id);
+ if (it != data_file_.upper_bounds.cend()) {
+ ICEBERG_ASSIGN_OR_RAISE(auto upper, ParseBound(expr, it->second));
+ if (upper <= lit) {
+ return kRowsMustMatch;
+ }
+ }
+
+ return kRowsMightNotMatch;
+ }
+
+ Result<bool> Gt(const std::shared_ptr<Bound>& expr, const Literal& lit)
override {
+ RETURN_IF_NOT_REFERENCE(expr);
+
+ // Rows must match when: <-------X---Min----Max---------->
+ int32_t id = expr->reference()->field().field_id();
+
+ ICEBERG_ASSIGN_OR_RAISE(auto is_nested, IsNestedColumn(id));
+ if (is_nested) {
+ return kRowsMightNotMatch;
+ }
+
+ if (CanContainNulls(id) || CanContainNaNs(id)) {
+ return kRowsMightNotMatch;
+ }
+
+ auto it = data_file_.lower_bounds.find(id);
+ if (it != data_file_.lower_bounds.cend()) {
+ ICEBERG_ASSIGN_OR_RAISE(auto lower, ParseBound(expr, it->second));
+ if (lower.IsNaN()) {
+ // NaN indicates unreliable bounds. See the StrictMetricsEvaluator
docs for
+ // more.
+ return kRowsMightNotMatch;
+ }
+
+ if (lower > lit) {
+ return kRowsMustMatch;
+ }
+ }
+
+ return kRowsMightNotMatch;
+ }
+
+ Result<bool> GtEq(const std::shared_ptr<Bound>& expr, const Literal& lit)
override {
+ RETURN_IF_NOT_REFERENCE(expr);
+
+ // Rows must match when: <-------X---Min----Max---------->
+ int32_t id = expr->reference()->field().field_id();
+
+ ICEBERG_ASSIGN_OR_RAISE(auto is_nested, IsNestedColumn(id));
+ if (is_nested) {
+ return kRowsMightNotMatch;
+ }
+
+ if (CanContainNulls(id) || CanContainNaNs(id)) {
+ return kRowsMightNotMatch;
+ }
+
+ auto it = data_file_.lower_bounds.find(id);
+ if (it != data_file_.lower_bounds.cend()) {
+ ICEBERG_ASSIGN_OR_RAISE(auto lower, ParseBound(expr, it->second));
+ if (lower.IsNaN()) {
+ // NaN indicates unreliable bounds. See the StrictMetricsEvaluator
docs for
+ // more.
+ return kRowsMightNotMatch;
+ }
+
+ if (lower >= lit) {
+ return kRowsMustMatch;
+ }
+ }
+
+ return kRowsMightNotMatch;
+ }
+
+ Result<bool> Eq(const std::shared_ptr<Bound>& expr, const Literal& lit)
override {
+ RETURN_IF_NOT_REFERENCE(expr);
+
+ // Rows must match when Min == X == Max
+ int32_t id = expr->reference()->field().field_id();
+
+ ICEBERG_ASSIGN_OR_RAISE(auto is_nested, IsNestedColumn(id));
+ if (is_nested) {
+ return kRowsMightNotMatch;
+ }
+
+ if (CanContainNulls(id) || CanContainNaNs(id)) {
+ return kRowsMightNotMatch;
+ }
+ auto lower_it = data_file_.lower_bounds.find(id);
+ auto upper_it = data_file_.upper_bounds.find(id);
+ if (lower_it != data_file_.lower_bounds.cend() &&
+ upper_it != data_file_.upper_bounds.cend()) {
+ ICEBERG_ASSIGN_OR_RAISE(auto lower, ParseBound(expr, lower_it->second));
+ if (lower != lit) {
+ return kRowsMightNotMatch;
+ }
+ ICEBERG_ASSIGN_OR_RAISE(auto upper, ParseBound(expr, upper_it->second));
+ if (upper != lit) {
+ return kRowsMightNotMatch;
+ }
+
+ return kRowsMustMatch;
+ }
+
+ return kRowsMightNotMatch;
+ }
+
+ Result<bool> NotEq(const std::shared_ptr<Bound>& expr, const Literal& lit)
override {
+ RETURN_IF_NOT_REFERENCE(expr);
+
+ // Rows must match when X < Min or Max < X because it is not in the range
+ int32_t id = expr->reference()->field().field_id();
+
+ ICEBERG_ASSIGN_OR_RAISE(auto is_nested, IsNestedColumn(id));
+ if (is_nested) {
+ return kRowsMightNotMatch;
+ }
+
+ if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) {
+ return kRowsMustMatch;
+ }
+
+ auto lower_it = data_file_.lower_bounds.find(id);
+ if (lower_it != data_file_.lower_bounds.cend()) {
+ ICEBERG_ASSIGN_OR_RAISE(auto lower, ParseBound(expr, lower_it->second));
+ if (lower.IsNaN()) {
+ // NaN indicates unreliable bounds. See the StrictMetricsEvaluator
docs for
+ // more.
+ return kRowsMightNotMatch;
+ }
+ if (lower > lit) {
+ return kRowsMustMatch;
+ }
+ }
+
+ auto upper_it = data_file_.upper_bounds.find(id);
+ if (upper_it != data_file_.upper_bounds.cend()) {
+ ICEBERG_ASSIGN_OR_RAISE(auto upper, ParseBound(expr, upper_it->second));
+ if (upper < lit) {
+ return kRowsMustMatch;
+ }
+ }
+
+ return kRowsMightNotMatch;
+ }
+
+ Result<bool> In(const std::shared_ptr<Bound>& expr,
+ const BoundSetPredicate::LiteralSet& literal_set) override {
+ RETURN_IF_NOT_REFERENCE(expr);
+
+ int32_t id = expr->reference()->field().field_id();
+
+ ICEBERG_ASSIGN_OR_RAISE(auto is_nested, IsNestedColumn(id));
+ if (is_nested) {
+ return kRowsMightNotMatch;
+ }
+
+ if (CanContainNulls(id) || CanContainNaNs(id)) {
+ return kRowsMightNotMatch;
+ }
+ auto lower_it = data_file_.lower_bounds.find(id);
+ auto upper_it = data_file_.upper_bounds.find(id);
+ if (lower_it != data_file_.lower_bounds.cend() &&
+ upper_it != data_file_.upper_bounds.cend()) {
+ // similar to the implementation in eq, first check if the lower bound
is in the
+ // set
+ ICEBERG_ASSIGN_OR_RAISE(auto lower, ParseBound(expr, lower_it->second));
+ if (!literal_set.contains(lower)) {
+ return kRowsMightNotMatch;
+ }
+ // check if the upper bound is in the set
+ ICEBERG_ASSIGN_OR_RAISE(auto upper, ParseBound(expr, upper_it->second));
+ if (!literal_set.contains(upper)) {
+ return kRowsMightNotMatch;
+ }
+ // finally check if the lower bound and the upper bound are equal
+ if (lower != upper) {
+ return kRowsMightNotMatch;
+ }
+
+ // All values must be in the set if the lower bound and the upper bound
are in the
+ // set and are equal.
+ return kRowsMustMatch;
+ }
+
+ return kRowsMightNotMatch;
+ }
+
+ /// \brief Strict evaluation of `term NOT IN (literal_set)`.
+ ///
+ /// Rows must match when every literal in the set lies outside the file's
+ /// [lower bound, upper bound] range for the column, or when the column holds only
+ /// nulls or only NaNs. A bound that is missing from the file metrics cannot rule out
+ /// any literal.
+ ///
+ /// \param expr The bound term; anything other than a direct reference yields
+ /// kRowsMightNotMatch
+ /// \param literal_set The literals of the NOT IN predicate
+ /// \return kRowsMustMatch if all rows are guaranteed to satisfy the predicate,
+ /// kRowsMightNotMatch otherwise, or an error if a bound cannot be parsed
+ Result<bool> NotIn(const std::shared_ptr<Bound>& expr,
+                    const BoundSetPredicate::LiteralSet& literal_set) override {
+   RETURN_IF_NOT_REFERENCE(expr);
+
+   int32_t id = expr->reference()->field().field_id();
+
+   ICEBERG_ASSIGN_OR_RAISE(auto is_nested, IsNestedColumn(id));
+   if (is_nested) {
+     return kRowsMightNotMatch;
+   }
+
+   if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) {
+     return kRowsMustMatch;
+   }
+
+   std::optional<Literal> lower_bound;
+   auto lower_it = data_file_.lower_bounds.find(id);
+   if (lower_it != data_file_.lower_bounds.cend()) {
+     ICEBERG_ASSIGN_OR_RAISE(auto lower, ParseBound(expr, lower_it->second));
+     if (lower.IsNaN()) {
+       // NaN indicates unreliable bounds. See the StrictMetricsEvaluator docs for
+       // more.
+       return kRowsMightNotMatch;
+     }
+     lower_bound = std::move(lower);
+   }
+
+   // Keep only the literals that the lower bound does not rule out. When the file has
+   // no lower bound nothing can be ruled out, so every literal is kept; dropping them
+   // here would make the upper-bound check below see an empty view and wrongly report
+   // that all rows match.
+   auto literals_view = literal_set | std::views::filter([&](const Literal& lit) {
+                          return !lower_bound.has_value() ||
+                                 lower_bound.value() <= lit;
+                        });
+   // if all values are less than lower bound, rows must match (notIn).
+   if (lower_bound.has_value() && literals_view.empty()) {
+     return kRowsMustMatch;
+   }
+
+   auto upper_it = data_file_.upper_bounds.find(id);
+   if (upper_it != data_file_.upper_bounds.cend()) {
+     ICEBERG_ASSIGN_OR_RAISE(auto upper, ParseBound(expr, upper_it->second));
+     auto filtered_view = literals_view | std::views::filter([&](const Literal& lit) {
+                            return upper >= lit;
+                          });
+     if (filtered_view.empty()) {
+       // if all remaining values are greater than upper bound, rows must match
+       // (notIn).
+       return kRowsMustMatch;
+     }
+   }
+   return kRowsMightNotMatch;
+ }
+
+ Result<bool> StartsWith(const std::shared_ptr<Bound>& expr,
+ const Literal& lit) override {
+ return kRowsMightNotMatch;
+ }
+
+ Result<bool> NotStartsWith(const std::shared_ptr<Bound>& expr,
+ const Literal& lit) override {
+ // TODO(xiao.dong) Handle cases that definitely cannot match,
+ // such as notStartsWith("x") when
+ // the bounds are ["a", "b"].
+ return kRowsMightNotMatch;
+ }
+
+ private:
+ Result<Literal> ParseBound(const std::shared_ptr<Bound>& expr,
+ const std::vector<uint8_t>& stats) {
+ auto type = expr->reference()->type();
+ if (!type->is_primitive()) {
+ return NotSupported("Bound of non-primitive type is not supported.");
+ }
+ auto primitive_type = internal::checked_pointer_cast<PrimitiveType>(type);
+ return Literal::Deserialize(stats, primitive_type);
+ }
+
+ bool CanContainNulls(int32_t id) {
+ if (data_file_.null_value_counts.empty()) {
+ return true;
+ }
+ auto it = data_file_.null_value_counts.find(id);
+ return it != data_file_.null_value_counts.cend() && it->second > 0;
+ }
+
+ bool CanContainNaNs(int32_t id) {
+ // nan counts might be null for early version writers when nan counters
are not
+ // populated.
+ auto it = data_file_.nan_value_counts.find(id);
+ return it != data_file_.nan_value_counts.cend() && it->second > 0;
+ }
+
+ bool ContainsNullsOnly(int32_t id) {
+ auto val_it = data_file_.value_counts.find(id);
+ auto null_it = data_file_.null_value_counts.find(id);
+ return val_it != data_file_.value_counts.cend() &&
+ null_it != data_file_.null_value_counts.cend() &&
+ val_it->second == null_it->second;
+ }
+
+ bool ContainsNaNsOnly(int32_t id) {
+ auto val_it = data_file_.value_counts.find(id);
+ auto nan_it = data_file_.nan_value_counts.find(id);
+ return val_it != data_file_.value_counts.cend() &&
+ nan_it != data_file_.nan_value_counts.cend() &&
+ val_it->second == nan_it->second;
+ }
+
+ Result<bool> IsNestedColumn(int32_t id) {
+ // XXX: null_count might be missing from nested columns but required by
+ // StrictMetricsEvaluator.
+ // See https://github.com/apache/iceberg/pull/11261.
+ ICEBERG_ASSIGN_OR_RAISE(auto field, schema_.GetFieldById(id));
+ return !field.has_value() || field->get().type()->is_nested();
+ }
+
+ private:
+ const DataFile& data_file_;
+ const Schema& schema_;
+};
+
+StrictMetricsEvaluator::StrictMetricsEvaluator(std::shared_ptr<Expression>
expr,
+ std::shared_ptr<Schema> schema)
+ : expr_(std::move(expr)), schema_(std::move(schema)) {}
+
+StrictMetricsEvaluator::~StrictMetricsEvaluator() = default;
+
+Result<std::unique_ptr<StrictMetricsEvaluator>> StrictMetricsEvaluator::Make(
+ std::shared_ptr<Expression> expr, std::shared_ptr<Schema> schema,
+ bool case_sensitive) {
+ ICEBERG_ASSIGN_OR_RAISE(auto rewrite_expr,
RewriteNot::Visit(std::move(expr)));
+ ICEBERG_ASSIGN_OR_RAISE(auto bound_expr,
+ Binder::Bind(*schema, rewrite_expr, case_sensitive));
+ return std::unique_ptr<StrictMetricsEvaluator>(
+ new StrictMetricsEvaluator(std::move(bound_expr), std::move(schema)));
+}
+
+Result<bool> StrictMetricsEvaluator::Evaluate(const DataFile& data_file) const
{
+ if (data_file.record_count <= 0) {
+ return kRowsMustMatch;
+ }
+ StrictMetricsVisitor visitor(data_file, *schema_);
+ return Visit<bool, StrictMetricsVisitor>(expr_, visitor);
+}
+
+} // namespace iceberg
diff --git a/src/iceberg/expression/strict_metrics_evaluator.h
b/src/iceberg/expression/strict_metrics_evaluator.h
new file mode 100644
index 00000000..60dc74a9
--- /dev/null
+++ b/src/iceberg/expression/strict_metrics_evaluator.h
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+/// \file iceberg/expression/strict_metrics_evaluator.h
+///
+/// Evaluates an Expression on a DataFile to test whether all rows in the file
match.
+///
+/// This evaluation is strict: it returns true if all rows in a file must
match the
+/// expression. For example, if a file's ts column has min X and max Y, this
evaluator
+/// will return true for ts < Y+1 but not for ts < Y-1.
+///
+/// Files are passed to StrictMetricsEvaluator::Evaluate(const DataFile&), which returns
+/// true if all rows in the file must match the expression and false if the file may
+/// contain rows that do not match.
+///
+/// Due to the comparison implementation of ORC stats, for float/double columns in ORC
+/// files, if the first value in a file is NaN, the metrics of this file will report NaN
+/// for both the upper and lower bound even though the column could contain non-NaN
+/// data. Thus, in some scenarios, explicit checks for NaN are necessary in order to not
+/// include files that may contain rows that don't match.
+///
+
+#include <memory>
+
+#include "iceberg/expression/expression.h"
+#include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
+#include "iceberg/type_fwd.h"
+
+namespace iceberg {
+
+/// \brief Evaluates an Expression against DataFile.
+/// \note: The evaluator is thread-safe.
+class ICEBERG_EXPORT StrictMetricsEvaluator {
+ public:
+ /// \brief Make a strict metrics evaluator
+ ///
+ /// \param expr The expression to evaluate
+ /// \param schema The schema of the table
+ /// \param case_sensitive Whether field name matching is case-sensitive
+ static Result<std::unique_ptr<StrictMetricsEvaluator>> Make(
+ std::shared_ptr<Expression> expr, std::shared_ptr<Schema> schema,
+ bool case_sensitive = true);
+
+ ~StrictMetricsEvaluator();
+
+ /// \brief Evaluate the expression against a DataFile.
+ ///
+ /// \param data_file The data file to evaluate
+ /// \return true if all rows in the file must match the expression, false if some
+ /// rows might not match, or an error
+ Result<bool> Evaluate(const DataFile& data_file) const;
+
+ private:
+ explicit StrictMetricsEvaluator(std::shared_ptr<Expression> expr,
+ std::shared_ptr<Schema> schema);
+
+ private:
+ std::shared_ptr<Expression> expr_;
+ std::shared_ptr<Schema> schema_;
+};
+
+} // namespace iceberg
diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build
index 5a993338..c139c66b 100644
--- a/src/iceberg/meson.build
+++ b/src/iceberg/meson.build
@@ -51,6 +51,7 @@ iceberg_sources = files(
'expression/literal.cc',
'expression/predicate.cc',
'expression/rewrite_not.cc',
+ 'expression/strict_metrics_evaluator.cc',
'expression/term.cc',
'file_reader.cc',
'file_writer.cc',
diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt
index a13d1f82..9892e3d4 100644
--- a/src/iceberg/test/CMakeLists.txt
+++ b/src/iceberg/test/CMakeLists.txt
@@ -87,7 +87,8 @@ add_iceberg_test(expression_test
literal_test.cc
inclusive_metrics_evaluator_test.cc
inclusive_metrics_evaluator_with_transform_test.cc
- predicate_test.cc)
+ predicate_test.cc
+ strict_metrics_evaluator_test.cc)
add_iceberg_test(json_serde_test
SOURCES
diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build
index 4cb153ba..c73abe18 100644
--- a/src/iceberg/test/meson.build
+++ b/src/iceberg/test/meson.build
@@ -65,6 +65,7 @@ iceberg_tests = {
'inclusive_metrics_evaluator_with_transform_test.cc',
'literal_test.cc',
'predicate_test.cc',
+ 'strict_metrics_evaluator_test.cc',
),
},
'json_serde_test': {
diff --git a/src/iceberg/test/strict_metrics_evaluator_test.cc
b/src/iceberg/test/strict_metrics_evaluator_test.cc
new file mode 100644
index 00000000..fa6185c3
--- /dev/null
+++ b/src/iceberg/test/strict_metrics_evaluator_test.cc
@@ -0,0 +1,849 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/expression/strict_metrics_evaluator.h"
+
+#include <limits>
+
+#include <gtest/gtest.h>
+
+#include "iceberg/expression/binder.h"
+#include "iceberg/expression/expressions.h"
+#include "iceberg/manifest/manifest_entry.h"
+#include "iceberg/schema.h"
+#include "iceberg/test/matchers.h"
+#include "iceberg/type.h"
+
+namespace iceberg {
+
+namespace {
+constexpr bool kRowsMustMatch = true;
+constexpr bool kRowsMightNotMatch = false;
+} // namespace
+using TestVariant = std::variant<bool, int32_t, int64_t, double, std::string>;
+
+class StrictMetricsEvaluatorTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ schema_ = std::make_shared<Schema>(
+ std::vector<SchemaField>{
+ SchemaField::MakeRequired(1, "id", int64()),
+ SchemaField::MakeOptional(2, "name", string()),
+ SchemaField::MakeRequired(3, "age", int32()),
+ SchemaField::MakeOptional(4, "salary", float64()),
+ SchemaField::MakeRequired(5, "active", boolean()),
+ SchemaField::MakeRequired(6, "date", string()),
+ },
+ /*schema_id=*/0);
+ }
+
+ Result<std::shared_ptr<Expression>> Bind(const std::shared_ptr<Expression>&
expr,
+ bool case_sensitive = true) {
+ return Binder::Bind(*schema_, expr, case_sensitive);
+ }
+
+ std::shared_ptr<DataFile> PrepareDataFile(
+ const std::string& partition, int64_t record_count, int64_t
file_size_in_bytes,
+ const std::map<std::string, TestVariant>& lower_bounds,
+ const std::map<std::string, TestVariant>& upper_bounds,
+ const std::map<int32_t, int64_t>& value_counts = {},
+ const std::map<int32_t, int64_t>& null_counts = {},
+ const std::map<int32_t, int64_t>& nan_counts = {}) {
+ auto parse_bound = [&](const std::map<std::string, TestVariant>& bounds,
+ std::map<int32_t, std::vector<uint8_t>>&
bound_values) {
+ for (const auto& [key, value] : bounds) {
+ if (key == "id") {
+ bound_values[1] =
Literal::Long(std::get<int64_t>(value)).Serialize().value();
+ } else if (key == "name") {
+ bound_values[2] =
+
Literal::String(std::get<std::string>(value)).Serialize().value();
+ } else if (key == "age") {
+ bound_values[3] =
Literal::Int(std::get<int32_t>(value)).Serialize().value();
+ } else if (key == "salary") {
+ bound_values[4] =
Literal::Double(std::get<double>(value)).Serialize().value();
+ } else if (key == "active") {
+ bound_values[5] =
Literal::Boolean(std::get<bool>(value)).Serialize().value();
+ }
+ }
+ };
+
+ auto data_file = std::make_shared<DataFile>();
+ data_file->file_path = "test_path";
+ data_file->file_format = FileFormatType::kParquet;
+ data_file->partition.AddValue(Literal::String(partition));
+ data_file->record_count = record_count;
+ data_file->file_size_in_bytes = file_size_in_bytes;
+ data_file->column_sizes = {};
+ data_file->value_counts = value_counts;
+ data_file->null_value_counts = null_counts;
+ data_file->nan_value_counts = nan_counts;
+ data_file->split_offsets = {1};
+ data_file->sort_order_id = 0;
+ parse_bound(upper_bounds, data_file->upper_bounds);
+ parse_bound(lower_bounds, data_file->lower_bounds);
+ return data_file;
+ }
+
+ void TestCase(const std::shared_ptr<Expression>& unbound, bool
expected_result) {
+ ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+ StrictMetricsEvaluator::Make(unbound, schema_,
true));
+ auto file = PrepareDataFile(/*partition=*/"20251128", /*record_count=*/10,
+ /*file_size_in_bytes=*/1024,
+ /*lower_bounds=*/{{"id",
static_cast<int64_t>(100)}},
+ /*upper_bounds=*/{{"id",
static_cast<int64_t>(200)}},
+ /*value_counts=*/{{1, 10}},
/*null_counts=*/{{1, 0}});
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), expected_result) << unbound->ToString();
+ }
+
+ void TestStringCase(const std::shared_ptr<Expression>& unbound, bool
expected_result) {
+ ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+ StrictMetricsEvaluator::Make(unbound, schema_,
true));
+ auto file = PrepareDataFile(/*partition=*/"20251128", /*record_count=*/10,
+ /*file_size_in_bytes=*/1024,
+ /*lower_bounds=*/{{"name", "123"}}, {{"name",
"456"}},
+ /*value_counts=*/{{2, 10}},
/*null_counts=*/{{2, 0}});
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), expected_result) << unbound->ToString();
+ }
+
+ std::shared_ptr<Schema> schema_;
+};
+
+TEST_F(StrictMetricsEvaluatorTest, CaseSensitiveTest) {
+ {
+ auto unbound = Expressions::Equal("id", Literal::Long(300));
+ auto evaluator = StrictMetricsEvaluator::Make(unbound, schema_, true);
+ ASSERT_TRUE(evaluator.has_value());
+ }
+ {
+ auto unbound = Expressions::Equal("ID", Literal::Long(300));
+ auto evaluator = StrictMetricsEvaluator::Make(unbound, schema_, true);
+ ASSERT_FALSE(evaluator.has_value());
+ ASSERT_EQ(evaluator.error().kind, ErrorKind::kInvalidExpression);
+ }
+ {
+ auto unbound = Expressions::Equal("ID", Literal::Long(300));
+ auto evaluator = StrictMetricsEvaluator::Make(unbound, schema_, false);
+ ASSERT_TRUE(evaluator.has_value());
+ }
+}
+
+TEST_F(StrictMetricsEvaluatorTest, IsNullTest) {
+ {
+ auto unbound = Expressions::IsNull("name");
+ ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+ StrictMetricsEvaluator::Make(unbound, schema_,
true));
+ auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}},
{{"name", "2"}},
+ {{2, 10}}, {{2, 5}}, {});
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), kRowsMightNotMatch) << unbound->ToString();
+ }
+ {
+ auto unbound = Expressions::IsNull("name");
+ ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+ StrictMetricsEvaluator::Make(unbound, schema_,
true));
+ auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}},
{{"name", "2"}},
+ {{2, 10}}, {{2, 10}}, {});
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), kRowsMustMatch) << unbound->ToString();
+ }
+}
+
+TEST_F(StrictMetricsEvaluatorTest, NotNullTest) {
+ {
+ auto unbound = Expressions::NotNull("name");
+ ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+ StrictMetricsEvaluator::Make(unbound, schema_,
true));
+ auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}},
{{"name", "2"}},
+ {{2, 10}}, {{2, 5}}, {});
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), kRowsMightNotMatch) << unbound->ToString();
+ }
+ {
+ auto unbound = Expressions::NotNull("name");
+ ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+ StrictMetricsEvaluator::Make(unbound, schema_,
true));
+ auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}},
{{"name", "2"}},
+ {{2, 10}}, {{2, 0}}, {});
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), kRowsMustMatch) << unbound->ToString();
+ }
+}
+
+TEST_F(StrictMetricsEvaluatorTest, IsNanTest) {
+ {
+ auto unbound = Expressions::IsNaN("salary");
+ ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+ StrictMetricsEvaluator::Make(unbound, schema_,
true));
+ auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}},
+ {{"salary", 2.0}}, {{4, 10}}, {{4, 5}}, {{4,
5}});
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), kRowsMightNotMatch) << unbound->ToString();
+ }
+ {
+ auto unbound = Expressions::IsNaN("salary");
+ ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+ StrictMetricsEvaluator::Make(unbound, schema_,
true));
+ auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}},
+ {{"salary", 2.0}}, {{4, 10}}, {{4, 10}}, {{4,
5}});
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), kRowsMightNotMatch) << unbound->ToString();
+ }
+ {
+ auto unbound = Expressions::IsNaN("salary");
+ ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+ StrictMetricsEvaluator::Make(unbound, schema_,
true));
+ auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}},
+ {{"salary", 2.0}}, {{4, 10}}, {{4, 5}}, {{4,
10}});
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), kRowsMustMatch) << unbound->ToString();
+ }
+}
+
+// Evaluates NotNaN("salary") with the strict evaluator against three metric combinations.
+// The last argument to PrepareDataFile is presumably the per-field NaN count map --
+// TODO confirm against the helper's signature.
+TEST_F(StrictMetricsEvaluatorTest, NotNanTest) {
+  auto predicate = Expressions::NotNaN("salary");
+  {
+    // Field 4: 10 values, empty null-count map, 5 in the last map.
+    ICEBERG_UNWRAP_OR_FAIL(auto strict_eval,
+                           StrictMetricsEvaluator::Make(predicate, schema_, true));
+    auto data_file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}},
+                                     {{"salary", 2.0}}, {{4, 10}}, {}, {{4, 5}});
+    auto outcome = strict_eval->Evaluate(*data_file);
+    ASSERT_TRUE(outcome.has_value());
+    ASSERT_EQ(outcome.value(), kRowsMightNotMatch) << predicate->ToString();
+  }
+  {
+    // Field 4: 10 values, empty null-count map, 0 in the last map.
+    ICEBERG_UNWRAP_OR_FAIL(auto strict_eval,
+                           StrictMetricsEvaluator::Make(predicate, schema_, true));
+    auto data_file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}},
+                                     {{"salary", 2.0}}, {{4, 10}}, {}, {{4, 0}});
+    auto outcome = strict_eval->Evaluate(*data_file);
+    ASSERT_TRUE(outcome.has_value());
+    ASSERT_EQ(outcome.value(), kRowsMustMatch) << predicate->ToString();
+  }
+  {
+    // Field 4: 10 values, 10 nulls, empty last map.
+    ICEBERG_UNWRAP_OR_FAIL(auto strict_eval,
+                           StrictMetricsEvaluator::Make(predicate, schema_, true));
+    auto data_file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}},
+                                     {{"salary", 2.0}}, {{4, 10}}, {{4, 10}}, {});
+    auto outcome = strict_eval->Evaluate(*data_file);
+    ASSERT_TRUE(outcome.has_value());
+    ASSERT_EQ(outcome.value(), kRowsMustMatch) << predicate->ToString();
+  }
+}
+
+// Runs LessThan("id", literal) through the fixture's TestCase helper for several literals.
+TEST_F(StrictMetricsEvaluatorTest, LTTest) {
+  struct Scenario {
+    int64_t literal;
+    bool expected;
+  };
+  const Scenario scenarios[] = {
+      {300, kRowsMustMatch},     {150, kRowsMightNotMatch}, {100, kRowsMightNotMatch},
+      {200, kRowsMightNotMatch}, {99, kRowsMightNotMatch},
+  };
+  for (const auto& scenario : scenarios) {
+    TestCase(Expressions::LessThan("id", Literal::Long(scenario.literal)),
+             scenario.expected);
+  }
+}
+
+// Runs LessThanOrEqual("id", literal) through the fixture's TestCase helper for several
+// literals.
+TEST_F(StrictMetricsEvaluatorTest, LTEQTest) {
+  struct Scenario {
+    int64_t literal;
+    bool expected;
+  };
+  const Scenario scenarios[] = {
+      {300, kRowsMustMatch}, {150, kRowsMightNotMatch}, {100, kRowsMightNotMatch},
+      {200, kRowsMustMatch}, {99, kRowsMightNotMatch},
+  };
+  for (const auto& scenario : scenarios) {
+    TestCase(Expressions::LessThanOrEqual("id", Literal::Long(scenario.literal)),
+             scenario.expected);
+  }
+}
+
+// Runs GreaterThan("id", literal) through the fixture's TestCase helper for several
+// literals.
+TEST_F(StrictMetricsEvaluatorTest, GTTest) {
+  struct Scenario {
+    int64_t literal;
+    bool expected;
+  };
+  const Scenario scenarios[] = {
+      {300, kRowsMightNotMatch}, {150, kRowsMightNotMatch}, {100, kRowsMightNotMatch},
+      {200, kRowsMightNotMatch}, {99, kRowsMustMatch},
+  };
+  for (const auto& scenario : scenarios) {
+    TestCase(Expressions::GreaterThan("id", Literal::Long(scenario.literal)),
+             scenario.expected);
+  }
+}
+
+// Runs GreaterThanOrEqual("id", literal) through the fixture's TestCase helper for
+// several literals.
+TEST_F(StrictMetricsEvaluatorTest, GTEQTest) {
+  struct Scenario {
+    int64_t literal;
+    bool expected;
+  };
+  const Scenario scenarios[] = {
+      {300, kRowsMightNotMatch}, {150, kRowsMightNotMatch}, {100, kRowsMustMatch},
+      {200, kRowsMightNotMatch}, {99, kRowsMustMatch},
+  };
+  for (const auto& scenario : scenarios) {
+    TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(scenario.literal)),
+             scenario.expected);
+  }
+}
+
+// Checks Equal("id", literal): first through the fixture's TestCase helper, then against
+// a file whose "id" lower and upper bounds are both 100.
+TEST_F(StrictMetricsEvaluatorTest, EQTest) {
+  const int64_t literals[] = {300, 150, 100, 200};
+  for (const int64_t literal : literals) {
+    TestCase(Expressions::Equal("id", Literal::Long(literal)), kRowsMightNotMatch);
+  }
+
+  // Evaluates `predicate` against a 10-row file with id bounds [100, 100] and a null
+  // count of 0 for field 1.
+  auto check_equal_bounds = [&](const std::shared_ptr<Expression>& predicate,
+                                bool expected) {
+    ICEBERG_UNWRAP_OR_FAIL(auto strict_eval,
+                           StrictMetricsEvaluator::Make(predicate, schema_, true));
+    auto data_file = PrepareDataFile(
+        /*partition=*/"20251128", /*record_count=*/10, /*file_size_in_bytes=*/1024,
+        /*lower_bounds=*/{{"id", static_cast<int64_t>(100)}},
+        /*upper_bounds=*/{{"id", static_cast<int64_t>(100)}},
+        /*value_counts=*/{{1, 10}}, /*null_counts=*/{{1, 0}});
+    auto outcome = strict_eval->Evaluate(*data_file);
+    ASSERT_TRUE(outcome.has_value());
+    ASSERT_EQ(outcome.value(), expected) << predicate->ToString();
+  };
+  check_equal_bounds(Expressions::Equal("id", Literal::Long(100)), kRowsMustMatch);
+  check_equal_bounds(Expressions::Equal("id", Literal::Long(200)), kRowsMightNotMatch);
+}
+
+// Runs NotEqual("id", literal) through the fixture's TestCase helper for several literals.
+TEST_F(StrictMetricsEvaluatorTest, NotEqTest) {
+  struct Scenario {
+    int64_t literal;
+    bool expected;
+  };
+  const Scenario scenarios[] = {
+      {300, kRowsMustMatch},     {150, kRowsMightNotMatch}, {100, kRowsMightNotMatch},
+      {200, kRowsMightNotMatch}, {99, kRowsMustMatch},
+  };
+  for (const auto& scenario : scenarios) {
+    TestCase(Expressions::NotEqual("id", Literal::Long(scenario.literal)),
+             scenario.expected);
+  }
+}
+
+// Checks In("id", ...): first through the fixture's TestCase helper, then against a file
+// whose "id" lower and upper bounds are both 100.
+TEST_F(StrictMetricsEvaluatorTest, InTest) {
+  auto five_values =
+      Expressions::In("id", {Literal::Long(100), Literal::Long(200), Literal::Long(300),
+                             Literal::Long(400), Literal::Long(500)});
+  TestCase(five_values, kRowsMightNotMatch);
+
+  // Evaluates `predicate` against a 10-row file with id bounds [100, 100] and a null
+  // count of 0 for field 1.
+  auto check_equal_bounds = [&](const std::shared_ptr<Expression>& predicate,
+                                bool expected) {
+    ICEBERG_UNWRAP_OR_FAIL(auto strict_eval,
+                           StrictMetricsEvaluator::Make(predicate, schema_, true));
+    auto data_file = PrepareDataFile(
+        /*partition=*/"20251128", /*record_count=*/10, /*file_size_in_bytes=*/1024,
+        /*lower_bounds=*/{{"id", static_cast<int64_t>(100)}},
+        /*upper_bounds=*/{{"id", static_cast<int64_t>(100)}},
+        /*value_counts=*/{{1, 10}}, /*null_counts=*/{{1, 0}});
+    auto outcome = strict_eval->Evaluate(*data_file);
+    ASSERT_TRUE(outcome.has_value());
+    ASSERT_EQ(outcome.value(), expected) << predicate->ToString();
+  };
+  check_equal_bounds(Expressions::In("id", {Literal::Long(100), Literal::Long(200)}),
+                     kRowsMustMatch);
+  check_equal_bounds(Expressions::In("id", {Literal::Long(200), Literal::Long(300)}),
+                     kRowsMightNotMatch);
+}
+
+// Runs NotIn("id", ...) through the fixture's TestCase helper, grouped by expectation.
+TEST_F(StrictMetricsEvaluatorTest, NotInTest) {
+  // Literal sets for which kRowsMustMatch is expected.
+  TestCase(Expressions::NotIn("id", {Literal::Long(88), Literal::Long(99)}),
+           kRowsMustMatch);
+  TestCase(Expressions::NotIn("id", {Literal::Long(288), Literal::Long(299)}),
+           kRowsMustMatch);
+  TestCase(Expressions::NotIn(
+               "id", {Literal::Long(88), Literal::Long(288), Literal::Long(299)}),
+           kRowsMustMatch);
+
+  // Literal sets for which kRowsMightNotMatch is expected.
+  TestCase(Expressions::NotIn("id", {Literal::Long(88), Literal::Long(100)}),
+           kRowsMightNotMatch);
+  TestCase(Expressions::NotIn("id", {Literal::Long(88), Literal::Long(101)}),
+           kRowsMightNotMatch);
+  TestCase(Expressions::NotIn("id", {Literal::Long(100), Literal::Long(101)}),
+           kRowsMightNotMatch);
+}
+
+// StartsWith("name", "1") is expected to evaluate to kRowsMightNotMatch via the
+// fixture's TestStringCase helper.
+TEST_F(StrictMetricsEvaluatorTest, StartsWithTest) {
+  auto predicate = Expressions::StartsWith("name", "1");
+  TestStringCase(predicate, kRowsMightNotMatch);
+}
+
+// NotStartsWith("name", "1") is expected to evaluate to kRowsMightNotMatch via the
+// fixture's TestStringCase helper.
+TEST_F(StrictMetricsEvaluatorTest, NotStartsWithTest) {
+  auto predicate = Expressions::NotStartsWith("name", "1");
+  TestStringCase(predicate, kRowsMightNotMatch);
+}
+
+// Fixture for the "Migrated" strict-metrics tests. SetUp() installs its own schema and
+// three DataFile fixtures; ExpectShouldRead() evaluates one expression against a file.
+// NOTE(review): if the base fixture also declares `schema_`, the member declared here
+// shadows it -- confirm that is intended.
+class StrictMetricsEvaluatorMigratedTest : public StrictMetricsEvaluatorTest {
+ protected:
+  // Bounds written for fields 1 ("id") and 17 ("struct.nested_col_with_stats").
+  static constexpr int64_t kIntMinValue = 30;
+  static constexpr int64_t kIntMaxValue = 79;
+  // Used as both the lower and the upper bound of field 7 ("always_5").
+  static constexpr int64_t kAlwaysFive = 5;
+
+  // Builds the schema and the three data files used by the tests.
+  void SetUp() override {
+    schema_ = std::make_shared<Schema>(
+        std::vector<SchemaField>{
+            SchemaField::MakeRequired(1, "id", int64()),
+            SchemaField::MakeOptional(2, "no_stats", int64()),
+            SchemaField::MakeRequired(3, "required", string()),
+            SchemaField::MakeOptional(4, "all_nulls", string()),
+            SchemaField::MakeOptional(5, "some_nulls", string()),
+            SchemaField::MakeOptional(6, "no_nulls", string()),
+            SchemaField::MakeRequired(7, "always_5", int64()),
+            SchemaField::MakeOptional(8, "all_nans", float64()),
+            SchemaField::MakeOptional(9, "some_nans", float32()),
+            SchemaField::MakeOptional(10, "no_nans", float32()),
+            SchemaField::MakeOptional(11, "all_nulls_double", float64()),
+            SchemaField::MakeOptional(12, "all_nans_v1_stats", float32()),
+            SchemaField::MakeOptional(13, "nan_and_null_only", float64()),
+            SchemaField::MakeOptional(14, "no_nan_stats", float64()),
+            SchemaField::MakeOptional(
+                15, "struct",
+                std::make_shared<StructType>(std::vector<SchemaField>{
+                    SchemaField::MakeOptional(16, "nested_col_no_stats", int64()),
+                    SchemaField::MakeOptional(17, "nested_col_with_stats", int64())})),
+        },
+        /*schema_id=*/0);
+
+    file_ = MakePrimaryFile();
+    file_with_bounds_ = MakeSomeNullsFile();
+    file_with_equal_bounds_ = MakeSomeNullsEqualBoundsFile();
+  }
+
+  // Allocates a DataFile with the given path and record count and the Parquet file
+  // format; no metrics maps are populated here.
+  static std::shared_ptr<DataFile> NewParquetFile(const char* path, int64_t records) {
+    auto created = std::make_shared<DataFile>();
+    created->file_path = path;
+    created->file_format = FileFormatType::kParquet;
+    created->record_count = records;
+    return created;
+  }
+
+  // Default file for ExpectShouldRead(): 50 records with value/null/NaN counts and
+  // serialized bounds for a subset of the schema's field ids.
+  std::shared_ptr<DataFile> MakePrimaryFile() {
+    const auto long_bound = [](int64_t value) {
+      return Literal::Long(value).Serialize().value();
+    };
+    const auto float_nan_bound = [] {
+      return Literal::Float(std::numeric_limits<float>::quiet_NaN()).Serialize().value();
+    };
+    const auto double_nan_bound = [] {
+      return Literal::Double(std::numeric_limits<double>::quiet_NaN())
+          .Serialize()
+          .value();
+    };
+
+    auto primary = NewParquetFile("file.avro", 50);
+    primary->value_counts = {
+        {4, 50L},  {5, 50L},  {6, 50L},  {8, 50L},  {9, 50L},  {10, 50L},
+        {11, 50L}, {12, 50L}, {13, 50L}, {14, 50L}, {17, 50L},
+    };
+    primary->null_value_counts = {
+        {4, 50L}, {5, 10L}, {6, 0L}, {11, 50L}, {12, 0L}, {13, 1L}, {17, 0L},
+    };
+    primary->nan_value_counts = {
+        {8, 50L},
+        {9, 10L},
+        {10, 0L},
+    };
+    primary->lower_bounds = {
+        {1, long_bound(kIntMinValue)}, {7, long_bound(kAlwaysFive)},
+        {12, float_nan_bound()},       {13, double_nan_bound()},
+        {17, long_bound(kIntMinValue)},
+    };
+    primary->upper_bounds = {
+        {1, long_bound(kIntMaxValue)}, {7, long_bound(kAlwaysFive)},
+        {12, float_nan_bound()},       {13, double_nan_bound()},
+        {17, long_bound(kIntMaxValue)},
+    };
+    return primary;
+  }
+
+  // 50-record file where field 5 has 10 nulls and string bounds "bbb".."eee".
+  std::shared_ptr<DataFile> MakeSomeNullsFile() {
+    auto with_bounds = NewParquetFile("file_2.avro", 50);
+    with_bounds->value_counts = {
+        {4, 50L},
+        {5, 50L},
+        {6, 50L},
+        {8, 50L},
+    };
+    with_bounds->null_value_counts = {
+        {4, 50L},
+        {5, 10L},
+        {6, 0L},
+    };
+    with_bounds->lower_bounds = {
+        {5, Literal::String("bbb").Serialize().value()},
+    };
+    with_bounds->upper_bounds = {
+        {5, Literal::String("eee").Serialize().value()},
+    };
+    return with_bounds;
+  }
+
+  // 50-record file where field 5 has 10 nulls and both string bounds equal "bbb".
+  std::shared_ptr<DataFile> MakeSomeNullsEqualBoundsFile() {
+    auto equal_bounds = NewParquetFile("file_3.avro", 50);
+    equal_bounds->value_counts = {
+        {4, 50L},
+        {5, 50L},
+        {6, 50L},
+    };
+    equal_bounds->null_value_counts = {
+        {4, 50L},
+        {5, 10L},
+        {6, 0L},
+    };
+    equal_bounds->lower_bounds = {
+        {5, Literal::String("bbb").Serialize().value()},
+    };
+    equal_bounds->upper_bounds = {
+        {5, Literal::String("bbb").Serialize().value()},
+    };
+    return equal_bounds;
+  }
+
+  // 50-record file with no counts or bounds assigned.
+  std::shared_ptr<DataFile> MakeMissingStatsFile() {
+    return NewParquetFile("missing.parquet", 50);
+  }
+
+  // File with a record count of 0 and no counts or bounds assigned.
+  std::shared_ptr<DataFile> MakeZeroRecordFile() {
+    return NewParquetFile("zero.parquet", 0);
+  }
+
+  // Builds a StrictMetricsEvaluator for `expr` over schema_ and asserts that evaluating
+  // it on `file` (or on file_ when `file` is null) succeeds and yields `expected`.
+  void ExpectShouldRead(const std::shared_ptr<Expression>& expr, bool expected,
+                        std::shared_ptr<DataFile> file = nullptr,
+                        bool case_sensitive = true) {
+    const std::shared_ptr<DataFile>& data_file = (file != nullptr) ? file : file_;
+    ICEBERG_UNWRAP_OR_FAIL(auto strict_eval,
+                           StrictMetricsEvaluator::Make(expr, schema_, case_sensitive));
+    auto outcome = strict_eval->Evaluate(*data_file);
+    ASSERT_TRUE(outcome.has_value());
+    ASSERT_EQ(outcome.value(), expected) << expr->ToString();
+  }
+
+  std::shared_ptr<Schema> schema_;
+  std::shared_ptr<DataFile> file_;
+  std::shared_ptr<DataFile> file_with_bounds_;
+  std::shared_ptr<DataFile> file_with_equal_bounds_;
+};
+
+// NotNull / NotEqual predicates on the string columns of the primary file.
+TEST_F(StrictMetricsEvaluatorMigratedTest, AllNulls) {
+  struct Scenario {
+    std::shared_ptr<Expression> predicate;
+    bool expected;
+  };
+  const Scenario scenarios[] = {
+      {Expressions::NotNull("all_nulls"), false},
+      {Expressions::NotNull("some_nulls"), false},
+      {Expressions::NotNull("no_nulls"), true},
+      {Expressions::NotEqual("all_nulls", Literal::String("a")), true},
+  };
+  for (const auto& scenario : scenarios) {
+    ExpectShouldRead(scenario.predicate, scenario.expected);
+  }
+}
+
+// IsNull predicates on the string columns of the primary file.
+TEST_F(StrictMetricsEvaluatorMigratedTest, NoNulls) {
+  struct Scenario {
+    std::shared_ptr<Expression> predicate;
+    bool expected;
+  };
+  const Scenario scenarios[] = {
+      {Expressions::IsNull("all_nulls"), true},
+      {Expressions::IsNull("some_nulls"), false},
+      {Expressions::IsNull("no_nulls"), false},
+  };
+  for (const auto& scenario : scenarios) {
+    ExpectShouldRead(scenario.predicate, scenario.expected);
+  }
+}
+
+// Comparison predicates on "some_nulls" are all expected to be false, both for the file
+// with bounds "bbb".."eee" and for the file whose bounds are both "bbb".
+TEST_F(StrictMetricsEvaluatorMigratedTest, SomeNulls) {
+  const std::shared_ptr<Expression> range_predicates[] = {
+      Expressions::LessThan("some_nulls", Literal::String("ggg")),
+      Expressions::LessThanOrEqual("some_nulls", Literal::String("eee")),
+      Expressions::GreaterThan("some_nulls", Literal::String("aaa")),
+      Expressions::GreaterThanOrEqual("some_nulls", Literal::String("bbb")),
+  };
+  for (const auto& predicate : range_predicates) {
+    ExpectShouldRead(predicate, false, file_with_bounds_);
+  }
+
+  auto equals_bbb = Expressions::Equal("some_nulls", Literal::String("bbb"));
+  ExpectShouldRead(equals_bbb, false, file_with_equal_bounds_);
+}
+
+// IsNaN predicates on the floating-point columns of the primary file; only "all_nans"
+// is expected to be true.
+TEST_F(StrictMetricsEvaluatorMigratedTest, IsNaN) {
+  struct Scenario {
+    const char* column;
+    bool expected;
+  };
+  const Scenario scenarios[] = {
+      {"all_nans", true},           {"some_nans", false},
+      {"no_nans", false},           {"all_nulls_double", false},
+      {"no_nan_stats", false},      {"all_nans_v1_stats", false},
+      {"nan_and_null_only", false},
+  };
+  for (const auto& scenario : scenarios) {
+    ExpectShouldRead(Expressions::IsNaN(scenario.column), scenario.expected);
+  }
+}
+
+// NotNaN predicates on the floating-point columns of the primary file; only "no_nans"
+// and "all_nulls_double" are expected to be true.
+TEST_F(StrictMetricsEvaluatorMigratedTest, NotNaN) {
+  struct Scenario {
+    const char* column;
+    bool expected;
+  };
+  const Scenario scenarios[] = {
+      {"all_nans", false},          {"some_nans", false},
+      {"no_nans", true},            {"all_nulls_double", true},
+      {"no_nan_stats", false},      {"all_nans_v1_stats", false},
+      {"nan_and_null_only", false},
+  };
+  for (const auto& scenario : scenarios) {
+    ExpectShouldRead(Expressions::NotNaN(scenario.column), scenario.expected);
+  }
+}
+
+// "required" is declared with MakeRequired in the fixture schema: NotNull is expected to
+// be true and IsNull false.
+TEST_F(StrictMetricsEvaluatorMigratedTest, RequiredColumn) {
+  auto not_null = Expressions::NotNull("required");
+  auto is_null = Expressions::IsNull("required");
+  ExpectShouldRead(not_null, true);
+  ExpectShouldRead(is_null, false);
+}
+
+TEST_F(StrictMetricsEvaluatorMigratedTest, MissingColumn) {
+ auto expr = Expressions::LessThan("missing", Literal::Long(5));
+ auto evaluator = StrictMetricsEvaluator::Make(expr, schema_, true);
+ ASSERT_FALSE(evaluator.has_value());
+ EXPECT_TRUE(evaluator.error().message.contains("Cannot find field
'missing'"))
+ << evaluator.error().message;
+}
+
+// Every predicate is expected to be false for a 50-record file that carries no counts
+// or bounds.
+TEST_F(StrictMetricsEvaluatorMigratedTest, MissingStats) {
+  const std::shared_ptr<Expression> predicates[] = {
+      Expressions::LessThan("no_stats", Literal::Long(5)),
+      Expressions::LessThanOrEqual("no_stats", Literal::Long(30)),
+      Expressions::Equal("no_stats", Literal::Long(70)),
+      Expressions::GreaterThan("no_stats", Literal::Long(78)),
+      Expressions::GreaterThanOrEqual("no_stats", Literal::Long(90)),
+      Expressions::NotEqual("no_stats", Literal::Long(101)),
+      Expressions::IsNull("no_stats"),
+      Expressions::NotNull("no_stats"),
+      Expressions::IsNaN("all_nans"),
+      Expressions::NotNaN("all_nans"),
+  };
+  auto stats_free_file = MakeMissingStatsFile();
+  for (const auto& predicate : predicates) {
+    ExpectShouldRead(predicate, false, stats_free_file);
+  }
+}
+
+// Every predicate is expected to be true for a file whose record count is 0.
+TEST_F(StrictMetricsEvaluatorMigratedTest, ZeroRecordFile) {
+  const std::shared_ptr<Expression> predicates[] = {
+      Expressions::LessThan("id", Literal::Long(5)),
+      Expressions::LessThanOrEqual("id", Literal::Long(30)),
+      Expressions::Equal("id", Literal::Long(70)),
+      Expressions::GreaterThan("id", Literal::Long(78)),
+      Expressions::GreaterThanOrEqual("id", Literal::Long(90)),
+      Expressions::NotEqual("id", Literal::Long(101)),
+      Expressions::IsNull("some_nulls"),
+      Expressions::NotNull("some_nulls"),
+      Expressions::IsNaN("all_nans"),
+      Expressions::NotNaN("all_nans"),
+  };
+  auto empty_file = MakeZeroRecordFile();
+  for (const auto& predicate : predicates) {
+    ExpectShouldRead(predicate, true, empty_file);
+  }
+}
+
+// Negated comparisons of "id" against kIntMinValue - 25 (= 5).
+TEST_F(StrictMetricsEvaluatorMigratedTest, Not) {
+  auto below_range = Expressions::LessThan("id", Literal::Long(kIntMinValue - 25));
+  auto above_low_value = Expressions::GreaterThan("id", Literal::Long(kIntMinValue - 25));
+
+  ExpectShouldRead(Expressions::Not(below_range), true);
+  ExpectShouldRead(Expressions::Not(above_low_value), false);
+}
+
+// Conjunctions of "id" comparisons evaluated against the primary file.
+TEST_F(StrictMetricsEvaluatorMigratedTest, And) {
+  {
+    // id > 5 AND id <= 30
+    auto lhs = Expressions::GreaterThan("id", Literal::Long(kIntMinValue - 25));
+    auto rhs = Expressions::LessThanOrEqual("id", Literal::Long(kIntMinValue));
+    ExpectShouldRead(Expressions::And(lhs, rhs), false);
+  }
+  {
+    // id < 5 AND id >= 0
+    auto lhs = Expressions::LessThan("id", Literal::Long(kIntMinValue - 25));
+    auto rhs = Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMinValue - 30));
+    ExpectShouldRead(Expressions::And(lhs, rhs), false);
+  }
+  {
+    // id < 85 AND id >= 0
+    auto lhs = Expressions::LessThan("id", Literal::Long(kIntMaxValue + 6));
+    auto rhs = Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMinValue - 30));
+    ExpectShouldRead(Expressions::And(lhs, rhs), true);
+  }
+}
+
+// Disjunctions of "id" comparisons evaluated against the primary file.
+TEST_F(StrictMetricsEvaluatorMigratedTest, Or) {
+  {
+    // id < 5 OR id >= 80
+    auto lhs = Expressions::LessThan("id", Literal::Long(kIntMinValue - 25));
+    auto rhs = Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue + 1));
+    ExpectShouldRead(Expressions::Or(lhs, rhs), false);
+  }
+  {
+    // id < 5 OR id >= 60
+    auto lhs = Expressions::LessThan("id", Literal::Long(kIntMinValue - 25));
+    auto rhs = Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue - 19));
+    ExpectShouldRead(Expressions::Or(lhs, rhs), false);
+  }
+  {
+    // id < 5 OR id >= 30
+    auto lhs = Expressions::LessThan("id", Literal::Long(kIntMinValue - 25));
+    auto rhs = Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMinValue));
+    ExpectShouldRead(Expressions::Or(lhs, rhs), true);
+  }
+}
+
+// LessThan("id", literal) is expected to be true only for kIntMaxValue + 1.
+TEST_F(StrictMetricsEvaluatorMigratedTest, IntegerLt) {
+  struct Scenario {
+    int64_t literal;
+    bool expected;
+  };
+  const Scenario scenarios[] = {
+      {kIntMinValue, false},
+      {kIntMinValue + 1, false},
+      {kIntMaxValue, false},
+      {kIntMaxValue + 1, true},
+  };
+  for (const auto& scenario : scenarios) {
+    ExpectShouldRead(Expressions::LessThan("id", Literal::Long(scenario.literal)),
+                     scenario.expected);
+  }
+}
+
+// LessThanOrEqual("id", literal) is expected to be true from kIntMaxValue upwards.
+TEST_F(StrictMetricsEvaluatorMigratedTest, IntegerLtEq) {
+  struct Scenario {
+    int64_t literal;
+    bool expected;
+  };
+  const Scenario scenarios[] = {
+      {kIntMinValue - 1, false},
+      {kIntMinValue, false},
+      {kIntMaxValue, true},
+      {kIntMaxValue + 1, true},
+  };
+  for (const auto& scenario : scenarios) {
+    ExpectShouldRead(Expressions::LessThanOrEqual("id", Literal::Long(scenario.literal)),
+                     scenario.expected);
+  }
+}
+
+// GreaterThan("id", literal) is expected to be true only for kIntMinValue - 1.
+TEST_F(StrictMetricsEvaluatorMigratedTest, IntegerGt) {
+  struct Scenario {
+    int64_t literal;
+    bool expected;
+  };
+  const Scenario scenarios[] = {
+      {kIntMaxValue, false},
+      {kIntMaxValue - 1, false},
+      {kIntMinValue, false},
+      {kIntMinValue - 1, true},
+  };
+  for (const auto& scenario : scenarios) {
+    ExpectShouldRead(Expressions::GreaterThan("id", Literal::Long(scenario.literal)),
+                     scenario.expected);
+  }
+}
+
+// GreaterThanOrEqual("id", literal) is expected to be true only for kIntMinValue.
+TEST_F(StrictMetricsEvaluatorMigratedTest, IntegerGtEq) {
+  struct Scenario {
+    int64_t literal;
+    bool expected;
+  };
+  const Scenario scenarios[] = {
+      {kIntMaxValue + 1, false},
+      {kIntMaxValue, false},
+      {kIntMinValue + 1, false},
+      {kIntMinValue, true},
+  };
+  for (const auto& scenario : scenarios) {
+    ExpectShouldRead(
+        Expressions::GreaterThanOrEqual("id", Literal::Long(scenario.literal)),
+        scenario.expected);
+  }
+}
+
+// Equal on "id" is expected to be false for every literal tried; Equal on "always_5"
+// with kIntMinValue - 25 (= 5) is expected to be true.
+TEST_F(StrictMetricsEvaluatorMigratedTest, IntegerEq) {
+  const int64_t id_literals[] = {
+      kIntMinValue - 25, kIntMinValue, kIntMaxValue - 4, kIntMaxValue, kIntMaxValue + 1,
+  };
+  for (const int64_t literal : id_literals) {
+    ExpectShouldRead(Expressions::Equal("id", Literal::Long(literal)), false);
+  }
+
+  auto always_five = Expressions::Equal("always_5", Literal::Long(kIntMinValue - 25));
+  ExpectShouldRead(always_five, true);
+}
+
+// NotEqual("id", literal) is expected to be true for the literals below kIntMinValue or
+// above kIntMaxValue, and false for the others.
+TEST_F(StrictMetricsEvaluatorMigratedTest, IntegerNotEq) {
+  struct Scenario {
+    int64_t literal;
+    bool expected;
+  };
+  const Scenario scenarios[] = {
+      {kIntMinValue - 25, true}, {kIntMinValue - 1, true}, {kIntMinValue, false},
+      {kIntMaxValue - 4, false}, {kIntMaxValue, false},    {kIntMaxValue + 1, true},
+      {kIntMaxValue + 6, true},
+  };
+  for (const auto& scenario : scenarios) {
+    ExpectShouldRead(Expressions::NotEqual("id", Literal::Long(scenario.literal)),
+                     scenario.expected);
+  }
+}
+
+// Same literals and expectations as IntegerNotEq, but written as Not(Equal(...)).
+TEST_F(StrictMetricsEvaluatorMigratedTest, IntegerNotEqRewritten) {
+  struct Scenario {
+    int64_t literal;
+    bool expected;
+  };
+  const Scenario scenarios[] = {
+      {kIntMinValue - 25, true}, {kIntMinValue - 1, true}, {kIntMinValue, false},
+      {kIntMaxValue - 4, false}, {kIntMaxValue, false},    {kIntMaxValue + 1, true},
+      {kIntMaxValue + 6, true},
+  };
+  for (const auto& scenario : scenarios) {
+    auto negated_equal =
+        Expressions::Not(Expressions::Equal("id", Literal::Long(scenario.literal)));
+    ExpectShouldRead(negated_equal, scenario.expected);
+  }
+}
+
+// In predicates: two-literal sets on "id" (all expected false), then "always_5" and the
+// string columns.
+TEST_F(StrictMetricsEvaluatorMigratedTest, IntegerIn) {
+  struct LongPair {
+    int64_t first;
+    int64_t second;
+  };
+  const LongPair id_sets[] = {
+      {kIntMinValue - 25, kIntMinValue - 24}, {kIntMinValue - 1, kIntMinValue},
+      {kIntMaxValue - 4, kIntMaxValue - 3},   {kIntMaxValue, kIntMaxValue + 1},
+      {kIntMaxValue + 1, kIntMaxValue + 2},
+  };
+  for (const auto& set : id_sets) {
+    ExpectShouldRead(
+        Expressions::In("id", {Literal::Long(set.first), Literal::Long(set.second)}),
+        false);
+  }
+
+  ExpectShouldRead(Expressions::In("always_5", {Literal::Long(5), Literal::Long(6)}),
+                   true);
+
+  // String columns; "some_nulls" is evaluated on the file whose bounds are both "bbb".
+  ExpectShouldRead(
+      Expressions::In("all_nulls", {Literal::String("abc"), Literal::String("def")}),
+      false);
+  ExpectShouldRead(
+      Expressions::In("some_nulls", {Literal::String("abc"), Literal::String("def")}),
+      false, file_with_equal_bounds_);
+  ExpectShouldRead(
+      Expressions::In("no_nulls", {Literal::String("abc"), Literal::String("def")}),
+      false);
+}
+
+// NotIn predicates: two-literal sets on "id", then "always_5" and the string columns.
+TEST_F(StrictMetricsEvaluatorMigratedTest, IntegerNotIn) {
+  struct Scenario {
+    int64_t first;
+    int64_t second;
+    bool expected;
+  };
+  const Scenario id_scenarios[] = {
+      {kIntMinValue - 25, kIntMinValue - 24, true},
+      {kIntMinValue - 1, kIntMinValue, false},
+      {kIntMaxValue - 4, kIntMaxValue - 3, false},
+      {kIntMaxValue, kIntMaxValue + 1, false},
+      {kIntMaxValue + 1, kIntMaxValue + 2, true},
+  };
+  for (const auto& scenario : id_scenarios) {
+    ExpectShouldRead(
+        Expressions::NotIn(
+            "id", {Literal::Long(scenario.first), Literal::Long(scenario.second)}),
+        scenario.expected);
+  }
+
+  ExpectShouldRead(Expressions::NotIn("always_5", {Literal::Long(5), Literal::Long(6)}),
+                   false);
+
+  // String columns; "some_nulls" is evaluated on the file whose bounds are both "bbb".
+  ExpectShouldRead(
+      Expressions::NotIn("all_nulls", {Literal::String("abc"), Literal::String("def")}),
+      true);
+  ExpectShouldRead(
+      Expressions::NotIn("some_nulls", {Literal::String("abc"), Literal::String("def")}),
+      true, file_with_equal_bounds_);
+  ExpectShouldRead(
+      Expressions::NotIn("no_nulls", {Literal::String("abc"), Literal::String("def")}),
+      false);
+}
+
+// Predicates on "struct.nested_col_no_stats" (field 16, which has no counts or bounds in
+// the primary file) are all expected to be false.
+TEST_F(StrictMetricsEvaluatorMigratedTest, EvaluateOnNestedColumnWithoutStats) {
+  const std::shared_ptr<Expression> predicates[] = {
+      Expressions::GreaterThanOrEqual("struct.nested_col_no_stats",
+                                      Literal::Long(kIntMinValue)),
+      Expressions::LessThanOrEqual("struct.nested_col_no_stats",
+                                   Literal::Long(kIntMaxValue)),
+      Expressions::IsNull("struct.nested_col_no_stats"),
+      Expressions::NotNull("struct.nested_col_no_stats"),
+  };
+  for (const auto& predicate : predicates) {
+    ExpectShouldRead(predicate, false);
+  }
+}
+
+// Predicates on "struct.nested_col_with_stats" (field 17, which has counts and bounds in
+// the primary file) are still all expected to be false.
+TEST_F(StrictMetricsEvaluatorMigratedTest, EvaluateOnNestedColumnWithStats) {
+  const std::shared_ptr<Expression> predicates[] = {
+      Expressions::GreaterThanOrEqual("struct.nested_col_with_stats",
+                                      Literal::Long(kIntMinValue)),
+      Expressions::LessThanOrEqual("struct.nested_col_with_stats",
+                                   Literal::Long(kIntMaxValue)),
+      Expressions::IsNull("struct.nested_col_with_stats"),
+      Expressions::NotNull("struct.nested_col_with_stats"),
+  };
+  for (const auto& predicate : predicates) {
+    ExpectShouldRead(predicate, false);
+  }
+}
+
+} // namespace iceberg