This is an automated email from the ASF dual-hosted git repository.
etudenhoefner pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-go.git
The following commit(s) were added to refs/heads/main by this push:
new 607a2fd refactor(evaluators): shift evaluator code into the table
package for future development (#123)
607a2fd is described below
commit 607a2fd1ecb3f66e5b578cd438907ddb31aa2460
Author: Matt Topol <[email protected]>
AuthorDate: Sun Aug 18 06:21:45 2024 -0400
refactor(evaluators): shift evaluator code into the table package for
future development (#123)
---
table/evaluators.go | 490 +++++++++++++++++++++++++++++++++++++++++++++
table/evaluators_test.go | 505 +++++++++++++++++++++++++++++++++++++++++++++++
visitors.go | 469 +------------------------------------------
visitors_test.go | 478 --------------------------------------------
4 files changed, 996 insertions(+), 946 deletions(-)
diff --git a/table/evaluators.go b/table/evaluators.go
new file mode 100644
index 0000000..343c8ef
--- /dev/null
+++ b/table/evaluators.go
@@ -0,0 +1,490 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package table
+
+import (
+ "github.com/apache/iceberg-go"
+ "github.com/google/uuid"
+)
+
+// Named boolean outcomes returned by the evaluators in this file, so that
+// call sites read as a verdict rather than a bare true/false.
+const (
+	rowsMightMatch    = true
+	rowsMustMatch     = true
+	rowsCannotMatch   = false
+	rowsMightNotMatch = false
+
+	// inPredicateLimit caps the size of an IN literal set that VisitIn will
+	// compare against the partition bounds; larger sets are reported as
+	// rowsMightMatch without inspecting the bounds.
+	inPredicateLimit = 200
+)
+
+// newManifestEvaluator builds a predicate over manifest files. The returned
+// function reports whether a manifest might contain rows matching
+// partitionFilter, judged from the partition summaries stored with the
+// manifest (UpperBound/LowerBound/ContainsNull/ContainsNaN).
+//
+// The filter has its NOT expressions rewritten and is then bound against a
+// schema made of the partition fields of spec; an error from either step is
+// returned as-is.
+//
+// NOTE(review): the returned function is a method value on one shared
+// visitor whose state Eval overwrites per call, so it is presumably not safe
+// for concurrent use - confirm against callers.
+func newManifestEvaluator(spec iceberg.PartitionSpec, schema *iceberg.Schema, partitionFilter iceberg.BooleanExpression, caseSensitive bool) (func(iceberg.ManifestFile) (bool, error), error) {
+	partSchema := iceberg.NewSchema(0, spec.PartitionType(schema).FieldList...)
+
+	rewritten, err := iceberg.RewriteNotExpr(partitionFilter)
+	if err != nil {
+		return nil, err
+	}
+
+	bound, err := iceberg.BindExpr(partSchema, rewritten, caseSensitive)
+	if err != nil {
+		return nil, err
+	}
+
+	visitor := &manifestEvalVisitor{partitionFilter: bound}
+	return visitor.Eval, nil
+}
+
+// manifestEvalVisitor evaluates a bound partition filter against the
+// partition field summaries of a single manifest file.
+type manifestEvalVisitor struct {
+	// partitionFields holds the summaries of the manifest being evaluated;
+	// Eval replaces it whenever the manifest carries summaries.
+	partitionFields []iceberg.FieldSummary
+	// partitionFilter is the bound expression that Eval visits.
+	partitionFilter iceberg.BooleanExpression
+}
+
+// Eval reports whether manifest might contain rows matching the visitor's
+// partition filter. A manifest without partition summaries cannot be ruled
+// out, so it is reported as rowsMightMatch without visiting the filter.
+func (m *manifestEvalVisitor) Eval(manifest iceberg.ManifestFile) (bool, error) {
+	summaries := manifest.Partitions()
+	if len(summaries) == 0 {
+		return rowsMightMatch, nil
+	}
+
+	// The Visit* methods read the summaries from the visitor itself.
+	m.partitionFields = summaries
+	return iceberg.VisitExpr(m.partitionFilter, m)
+}
+
+// allBoundCmp reports whether comparing bound against every literal in set
+// yields the ordering requested by want: a positive want requires bound to
+// sort after each element, a negative want requires it to sort before each
+// element, and zero requires equality.
+//
+// Only the sign of the comparator result is inspected, in line with
+// VisitGreater and VisitLess, so the check does not depend on the comparator
+// returning exactly -1 or 1.
+//
+// bound and every element of set must be TypedLiteral[T]; any other type
+// makes the type assertion panic.
+func allBoundCmp[T iceberg.LiteralType](bound iceberg.Literal, set iceberg.Set[iceberg.Literal], want int) bool {
+	val := bound.(iceberg.TypedLiteral[T])
+	cmp := val.Comparator()
+
+	return set.All(func(e iceberg.Literal) bool {
+		got := cmp(val.Value(), e.(iceberg.TypedLiteral[T]).Value())
+		switch {
+		case want > 0:
+			return got > 0
+		case want < 0:
+			return got < 0
+		default:
+			return got == 0
+		}
+	})
+}
+
+// allBoundCheck picks the allBoundCmp instantiation that matches the Iceberg
+// type of bound and runs it over set with the requested ordering want.
+// It panics with iceberg.ErrType when the type of bound is not handled.
+func allBoundCheck(bound iceberg.Literal, set iceberg.Set[iceberg.Literal], want int) bool {
+	var check func(iceberg.Literal, iceberg.Set[iceberg.Literal], int) bool
+
+	switch bound.Type().(type) {
+	case iceberg.BooleanType:
+		check = allBoundCmp[bool]
+	case iceberg.Int32Type:
+		check = allBoundCmp[int32]
+	case iceberg.Int64Type:
+		check = allBoundCmp[int64]
+	case iceberg.Float32Type:
+		check = allBoundCmp[float32]
+	case iceberg.Float64Type:
+		check = allBoundCmp[float64]
+	case iceberg.DateType:
+		check = allBoundCmp[iceberg.Date]
+	case iceberg.TimeType:
+		check = allBoundCmp[iceberg.Time]
+	case iceberg.TimestampType, iceberg.TimestampTzType:
+		check = allBoundCmp[iceberg.Timestamp]
+	case iceberg.BinaryType, iceberg.FixedType:
+		check = allBoundCmp[[]byte]
+	case iceberg.StringType:
+		check = allBoundCmp[string]
+	case iceberg.UUIDType:
+		check = allBoundCmp[uuid.UUID]
+	case iceberg.DecimalType:
+		check = allBoundCmp[iceberg.Decimal]
+	default:
+		panic(iceberg.ErrType)
+	}
+
+	return check(bound, set, want)
+}
+
+// VisitIn evaluates `term IN literals` against the partition summary of the
+// referenced field. The manifest is ruled out when the summary has no lower
+// bound, when every literal sorts below the lower bound, or when every
+// literal sorts above the upper bound. Literal sets larger than
+// inPredicateLimit are not compared and report rowsMightMatch.
+//
+// It panics if a bound cannot be decoded for the term's type.
+func (m *manifestEvalVisitor) VisitIn(term iceberg.BoundTerm, literals iceberg.Set[iceberg.Literal]) bool {
+	summary := m.partitionFields[term.Ref().Pos()]
+
+	switch {
+	case summary.LowerBound == nil:
+		return rowsCannotMatch
+	case literals.Len() > inPredicateLimit:
+		return rowsMightMatch
+	}
+
+	lowerLit, err := iceberg.LiteralFromBytes(term.Type(), *summary.LowerBound)
+	if err != nil {
+		panic(err)
+	}
+
+	// lower bound greater than every literal: nothing in the set is in range
+	if allBoundCheck(lowerLit, literals, 1) {
+		return rowsCannotMatch
+	}
+
+	if summary.UpperBound == nil {
+		return rowsMightMatch
+	}
+
+	upperLit, err := iceberg.LiteralFromBytes(term.Type(), *summary.UpperBound)
+	if err != nil {
+		panic(err)
+	}
+
+	// upper bound less than every literal: nothing in the set is in range
+	if allBoundCheck(upperLit, literals, -1) {
+		return rowsCannotMatch
+	}
+
+	return rowsMightMatch
+}
+
+// VisitNotIn always reports rowsMightMatch. The bounds are not necessarily
+// actual minimum or maximum values of the column, so they cannot answer this
+// predicate: notIn(col, {X, ...}) with bounds (X, Y) does not guarantee that
+// X is a value in col.
+func (m *manifestEvalVisitor) VisitNotIn(term iceberg.BoundTerm, literals iceberg.Set[iceberg.Literal]) bool {
+	return rowsMightMatch
+}
+
+// VisitIsNan evaluates `term IS NaN`. The manifest is ruled out only when
+// the summary explicitly records that the field contains no NaN; a missing
+// ContainsNaN flag is treated as "might contain NaN".
+func (m *manifestEvalVisitor) VisitIsNan(term iceberg.BoundTerm) bool {
+	hasNaN := m.partitionFields[term.Ref().Pos()].ContainsNaN
+	if hasNaN == nil || *hasNaN {
+		return rowsMightMatch
+	}
+
+	return rowsCannotMatch
+}
+
+// VisitNotNan evaluates `term IS NOT NaN`. The manifest is ruled out only
+// when the summary records NaN values, records no nulls, and carries no
+// lower bound; in every other case it is reported as rowsMightMatch.
+func (m *manifestEvalVisitor) VisitNotNan(term iceberg.BoundTerm) bool {
+	summary := m.partitionFields[term.Ref().Pos()]
+
+	hasNaN := summary.ContainsNaN != nil && *summary.ContainsNaN
+	nothingElse := !summary.ContainsNull && summary.LowerBound == nil
+	if hasNaN && nothingElse {
+		return rowsCannotMatch
+	}
+
+	return rowsMightMatch
+}
+
+// VisitIsNull evaluates `term IS NULL`: the manifest might match exactly
+// when the field summary records that it contains a null.
+func (m *manifestEvalVisitor) VisitIsNull(term iceberg.BoundTerm) bool {
+	if m.partitionFields[term.Ref().Pos()].ContainsNull {
+		return rowsMightMatch
+	}
+
+	return rowsCannotMatch
+}
+
+// VisitNotNull evaluates `term IS NOT NULL`. The manifest is ruled out when
+// every partition value is null. For float32/float64 fields that conclusion
+// additionally requires the summary to state explicitly that there are no
+// NaN values.
+func (m *manifestEvalVisitor) VisitNotNull(term iceberg.BoundTerm) bool {
+	summary := m.partitionFields[term.Ref().Pos()]
+
+	// ContainsNull says at least one partition value is null; a nil
+	// LowerBound says all partition values are null. Without both, some
+	// non-null value may exist.
+	if !summary.ContainsNull || summary.LowerBound != nil {
+		return rowsMightMatch
+	}
+
+	// Floating point columns may hold NaN values, which the bounds do not
+	// cover, so ContainsNaN has to be consulted as well.
+	typ := term.Ref().Type()
+	isFloat := typ.Equals(iceberg.PrimitiveTypes.Float32) || typ.Equals(iceberg.PrimitiveTypes.Float64)
+	if isFloat && (summary.ContainsNaN == nil || *summary.ContainsNaN) {
+		return rowsMightMatch
+	}
+
+	return rowsCannotMatch
+}
+
+// getCmp adapts the typed comparator of b into a comparison over untyped
+// literals. Both arguments of the returned function must be
+// TypedLiteral[T]; otherwise the type assertion panics.
+func getCmp[T iceberg.LiteralType](b iceberg.TypedLiteral[T]) func(iceberg.Literal, iceberg.Literal) int {
+	compare := b.Comparator()
+
+	return func(l1, l2 iceberg.Literal) int {
+		left := l1.(iceberg.TypedLiteral[T]).Value()
+		right := l2.(iceberg.TypedLiteral[T]).Value()
+		return compare(left, right)
+	}
+}
+
+// getCmpLiteral returns a comparison function for literals of the same
+// concrete type as boundary. It panics with iceberg.ErrType when boundary is
+// not one of the handled typed literals.
+func getCmpLiteral(boundary iceberg.Literal) func(iceberg.Literal, iceberg.Literal) int {
+	var compare func(iceberg.Literal, iceberg.Literal) int
+
+	switch typed := boundary.(type) {
+	case iceberg.TypedLiteral[bool]:
+		compare = getCmp(typed)
+	case iceberg.TypedLiteral[int32]:
+		compare = getCmp(typed)
+	case iceberg.TypedLiteral[int64]:
+		compare = getCmp(typed)
+	case iceberg.TypedLiteral[float32]:
+		compare = getCmp(typed)
+	case iceberg.TypedLiteral[float64]:
+		compare = getCmp(typed)
+	case iceberg.TypedLiteral[iceberg.Date]:
+		compare = getCmp(typed)
+	case iceberg.TypedLiteral[iceberg.Time]:
+		compare = getCmp(typed)
+	case iceberg.TypedLiteral[iceberg.Timestamp]:
+		compare = getCmp(typed)
+	case iceberg.TypedLiteral[[]byte]:
+		compare = getCmp(typed)
+	case iceberg.TypedLiteral[string]:
+		compare = getCmp(typed)
+	case iceberg.TypedLiteral[uuid.UUID]:
+		compare = getCmp(typed)
+	case iceberg.TypedLiteral[iceberg.Decimal]:
+		compare = getCmp(typed)
+	default:
+		panic(iceberg.ErrType)
+	}
+
+	return compare
+}
+
+// VisitEqual evaluates `term == lit` against the partition summary of the
+// referenced field. The manifest is ruled out when either bound is missing,
+// when lit sorts below the lower bound, or when lit sorts above the upper
+// bound; otherwise it is reported as rowsMightMatch.
+//
+// Comparator results are tested by sign (> 0), as VisitGreater and VisitLess
+// do, instead of being matched against the exact value 1.
+//
+// It panics if a bound cannot be decoded for the term's type.
+func (m *manifestEvalVisitor) VisitEqual(term iceberg.BoundTerm, lit iceberg.Literal) bool {
+	pos := term.Ref().Pos()
+	field := m.partitionFields[pos]
+
+	if field.LowerBound == nil || field.UpperBound == nil {
+		// values are all null and literal cannot contain null
+		return rowsCannotMatch
+	}
+
+	lower, err := iceberg.LiteralFromBytes(term.Ref().Type(), *field.LowerBound)
+	if err != nil {
+		panic(err)
+	}
+
+	cmp := getCmpLiteral(lower)
+	// lower bound above the literal: literal is outside the range
+	if cmp(lower, lit) > 0 {
+		return rowsCannotMatch
+	}
+
+	upper, err := iceberg.LiteralFromBytes(term.Ref().Type(), *field.UpperBound)
+	if err != nil {
+		panic(err)
+	}
+
+	// literal above the upper bound: literal is outside the range
+	if cmp(lit, upper) > 0 {
+		return rowsCannotMatch
+	}
+
+	return rowsMightMatch
+}
+
+// VisitNotEqual always reports rowsMightMatch. The bounds are not
+// necessarily actual minimum or maximum values, so they cannot answer this
+// predicate: notEq(col, X) with bounds (X, Y) does not guarantee that X is a
+// value in col.
+func (m *manifestEvalVisitor) VisitNotEqual(term iceberg.BoundTerm, lit iceberg.Literal) bool {
+	return rowsMightMatch
+}
+
+// VisitGreaterEqual evaluates `term >= lit`. The manifest is ruled out when
+// the summary has no upper bound or when lit sorts above the upper bound;
+// otherwise it is reported as rowsMightMatch.
+//
+// The comparator result is tested by sign (> 0), as VisitGreater does,
+// instead of being matched against the exact value 1.
+//
+// It panics if the upper bound cannot be decoded for the term's type.
+func (m *manifestEvalVisitor) VisitGreaterEqual(term iceberg.BoundTerm, lit iceberg.Literal) bool {
+	pos := term.Ref().Pos()
+	field := m.partitionFields[pos]
+
+	if field.UpperBound == nil {
+		return rowsCannotMatch
+	}
+
+	upper, err := iceberg.LiteralFromBytes(term.Ref().Type(), *field.UpperBound)
+	if err != nil {
+		panic(err)
+	}
+
+	// literal strictly above the upper bound: no value can be >= lit
+	if getCmpLiteral(upper)(lit, upper) > 0 {
+		return rowsCannotMatch
+	}
+
+	return rowsMightMatch
+}
+
+// VisitGreater evaluates `term > lit`. The manifest might match only when
+// the summary has an upper bound and lit sorts strictly below it.
+//
+// It panics if the upper bound cannot be decoded for the term's type.
+func (m *manifestEvalVisitor) VisitGreater(term iceberg.BoundTerm, lit iceberg.Literal) bool {
+	upperBytes := m.partitionFields[term.Ref().Pos()].UpperBound
+	if upperBytes == nil {
+		return rowsCannotMatch
+	}
+
+	upper, err := iceberg.LiteralFromBytes(term.Ref().Type(), *upperBytes)
+	if err != nil {
+		panic(err)
+	}
+
+	compare := getCmpLiteral(upper)
+	if compare(lit, upper) < 0 {
+		return rowsMightMatch
+	}
+
+	return rowsCannotMatch
+}
+
+// VisitLessEqual evaluates `term <= lit`. The manifest is ruled out when the
+// summary has no lower bound or when lit sorts below the lower bound;
+// otherwise it is reported as rowsMightMatch.
+//
+// The comparator result is tested by sign (< 0), as VisitLess does, instead
+// of being matched against the exact value -1.
+//
+// It panics if the lower bound cannot be decoded for the term's type.
+func (m *manifestEvalVisitor) VisitLessEqual(term iceberg.BoundTerm, lit iceberg.Literal) bool {
+	pos := term.Ref().Pos()
+	field := m.partitionFields[pos]
+
+	if field.LowerBound == nil {
+		return rowsCannotMatch
+	}
+
+	lower, err := iceberg.LiteralFromBytes(term.Ref().Type(), *field.LowerBound)
+	if err != nil {
+		panic(err)
+	}
+
+	// literal strictly below the lower bound: no value can be <= lit
+	if getCmpLiteral(lower)(lit, lower) < 0 {
+		return rowsCannotMatch
+	}
+
+	return rowsMightMatch
+}
+
+// VisitLess evaluates `term < lit`. The manifest might match only when the
+// summary has a lower bound and lit sorts strictly above it.
+//
+// It panics if the lower bound cannot be decoded for the term's type.
+func (m *manifestEvalVisitor) VisitLess(term iceberg.BoundTerm, lit iceberg.Literal) bool {
+	lowerBytes := m.partitionFields[term.Ref().Pos()].LowerBound
+	if lowerBytes == nil {
+		return rowsCannotMatch
+	}
+
+	lower, err := iceberg.LiteralFromBytes(term.Ref().Type(), *lowerBytes)
+	if err != nil {
+		panic(err)
+	}
+
+	compare := getCmpLiteral(lower)
+	if compare(lit, lower) > 0 {
+		return rowsMightMatch
+	}
+
+	return rowsCannotMatch
+}
+
+// VisitStartsWith evaluates `term STARTS WITH lit`. Each bound is cut down
+// to the length of the prefix before comparing: the manifest is ruled out
+// when a bound is missing, when the truncated lower bound sorts after the
+// prefix, or when the truncated upper bound sorts before it.
+//
+// lit must be a string or byte-slice literal, otherwise the type assertion
+// panics. It also panics if a bound cannot be decoded for the term's type.
+func (m *manifestEvalVisitor) VisitStartsWith(term iceberg.BoundTerm, lit iceberg.Literal) bool {
+	summary := m.partitionFields[term.Ref().Pos()]
+
+	var prefix string
+	if strLit, isString := lit.(iceberg.TypedLiteral[string]); isString {
+		prefix = strLit.Value()
+	} else {
+		prefix = string(lit.(iceberg.TypedLiteral[[]byte]).Value())
+	}
+
+	lenPrefix := len(prefix)
+
+	// clip returns the textual value of a bound, truncated so that its
+	// length is not greater than the length of prefix. Bounds that are
+	// neither string nor byte-slice literals yield the empty string.
+	clip := func(bound iceberg.Literal) string {
+		var s string
+		switch b := bound.(type) {
+		case iceberg.TypedLiteral[string]:
+			s = b.Value()
+		case iceberg.TypedLiteral[[]byte]:
+			s = string(b.Value())
+		default:
+			return ""
+		}
+
+		if len(s) > lenPrefix {
+			s = s[:lenPrefix]
+		}
+		return s
+	}
+
+	if summary.LowerBound == nil {
+		return rowsCannotMatch
+	}
+
+	lower, err := iceberg.LiteralFromBytes(term.Ref().Type(), *summary.LowerBound)
+	if err != nil {
+		panic(err)
+	}
+
+	if clip(lower) > prefix {
+		return rowsCannotMatch
+	}
+
+	if summary.UpperBound == nil {
+		return rowsCannotMatch
+	}
+
+	upper, err := iceberg.LiteralFromBytes(term.Ref().Type(), *summary.UpperBound)
+	if err != nil {
+		panic(err)
+	}
+
+	if clip(upper) < prefix {
+		return rowsCannotMatch
+	}
+
+	return rowsMightMatch
+}
+
+// VisitNotStartsWith evaluates `term NOT STARTS WITH lit`. The predicate can
+// match unless ALL values must start with the prefix, which is the case only
+// when the lower and upper bounds BOTH start with it. A summary that
+// contains nulls or lacks either bound is reported as rowsMightMatch.
+//
+// lit and both bounds must be string literals, or all byte-slice literals;
+// any other combination makes a type assertion panic. It also panics if a
+// bound cannot be decoded for the term's type.
+func (m *manifestEvalVisitor) VisitNotStartsWith(term iceberg.BoundTerm, lit iceberg.Literal) bool {
+	summary := m.partitionFields[term.Ref().Pos()]
+
+	if summary.ContainsNull || summary.LowerBound == nil || summary.UpperBound == nil {
+		return rowsMightMatch
+	}
+
+	lower, err := iceberg.LiteralFromBytes(term.Ref().Type(), *summary.LowerBound)
+	if err != nil {
+		panic(err)
+	}
+
+	upper, err := iceberg.LiteralFromBytes(term.Ref().Type(), *summary.UpperBound)
+	if err != nil {
+		panic(err)
+	}
+
+	var prefix, lowerText, upperText string
+	if strLit, isString := lit.(iceberg.TypedLiteral[string]); isString {
+		prefix = strLit.Value()
+		lowerText = lower.(iceberg.TypedLiteral[string]).Value()
+		upperText = upper.(iceberg.TypedLiteral[string]).Value()
+	} else {
+		prefix = string(lit.(iceberg.TypedLiteral[[]byte]).Value())
+		lowerText = string(lower.(iceberg.TypedLiteral[[]byte]).Value())
+		upperText = string(upper.(iceberg.TypedLiteral[[]byte]).Value())
+	}
+
+	// a bound shorter than the prefix cannot start with it
+	startsWithPrefix := func(s string) bool {
+		return len(s) >= len(prefix) && s[:len(prefix)] == prefix
+	}
+
+	if startsWithPrefix(lowerText) && startsWithPrefix(upperText) {
+		return rowsCannotMatch
+	}
+
+	return rowsMightMatch
+}
+
+// VisitTrue handles the always-true expression: the manifest might match.
+func (m *manifestEvalVisitor) VisitTrue() bool {
+	return rowsMightMatch
+}
+
+// VisitFalse handles the always-false expression: the manifest cannot match.
+func (m *manifestEvalVisitor) VisitFalse() bool {
+	return rowsCannotMatch
+}
+
+// VisitUnbound always panics: this visitor only works on bound predicates.
+func (m *manifestEvalVisitor) VisitUnbound(iceberg.UnboundPredicate) bool {
+	panic("need bound predicate")
+}
+
+// VisitBound hands a bound predicate to iceberg.VisitBoundPredicate with
+// this visitor, which routes it to the matching Visit* method.
+func (m *manifestEvalVisitor) VisitBound(pred iceberg.BoundPredicate) bool {
+	return iceberg.VisitBoundPredicate(pred, m)
+}
+
+// VisitNot negates the result computed for the child expression.
+func (m *manifestEvalVisitor) VisitNot(child bool) bool {
+	return !child
+}
+// VisitAnd combines the results of both operands with logical AND.
+func (m *manifestEvalVisitor) VisitAnd(left, right bool) bool {
+	return left && right
+}
+// VisitOr combines the results of both operands with logical OR.
+func (m *manifestEvalVisitor) VisitOr(left, right bool) bool {
+	return left || right
+}
diff --git a/table/evaluators_test.go b/table/evaluators_test.go
new file mode 100644
index 0000000..b8c2671
--- /dev/null
+++ b/table/evaluators_test.go
@@ -0,0 +1,505 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package table
+
+import (
+ "testing"
+
+ "github.com/apache/iceberg-go"
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+// Lower and upper bound used for the "id" partition summary in the
+// manifest evaluator tests.
+const (
+	IntMinValue int32 = 30
+	IntMaxValue int32 = 79
+)
+
+func TestManifestEvaluator(t *testing.T) {
+
+ var (
+ IntMin, IntMax = []byte{byte(IntMinValue), 0x00, 0x00,
0x00}, []byte{byte(IntMaxValue), 0x00, 0x00, 0x00}
+ StringMin, StringMax = []byte("a"), []byte("z")
+ FloatMin, _ = iceberg.Float32Literal(0).MarshalBinary()
+ FloatMax, _ =
iceberg.Float32Literal(20).MarshalBinary()
+ DblMin, _ = iceberg.Float64Literal(0).MarshalBinary()
+ DblMax, _ =
iceberg.Float64Literal(20).MarshalBinary()
+ NanTrue, NanFalse = true, false
+
+ testSchema = iceberg.NewSchema(1,
+ iceberg.NestedField{ID: 1, Name: "id",
+ Type: iceberg.PrimitiveTypes.Int32, Required:
true},
+ iceberg.NestedField{ID: 2, Name:
"all_nulls_missing_nan",
+ Type: iceberg.PrimitiveTypes.String, Required:
false},
+ iceberg.NestedField{ID: 3, Name: "some_nulls",
+ Type: iceberg.PrimitiveTypes.String, Required:
false},
+ iceberg.NestedField{ID: 4, Name: "no_nulls",
+ Type: iceberg.PrimitiveTypes.String, Required:
false},
+ iceberg.NestedField{ID: 5, Name: "float",
+ Type: iceberg.PrimitiveTypes.Float32, Required:
false},
+ iceberg.NestedField{ID: 6, Name: "all_nulls_double",
+ Type: iceberg.PrimitiveTypes.Float64, Required:
false},
+ iceberg.NestedField{ID: 7, Name: "all_nulls_no_nans",
+ Type: iceberg.PrimitiveTypes.Float32, Required:
false},
+ iceberg.NestedField{ID: 8, Name: "all_nans",
+ Type: iceberg.PrimitiveTypes.Float64, Required:
false},
+ iceberg.NestedField{ID: 9, Name: "both_nan_and_null",
+ Type: iceberg.PrimitiveTypes.Float32, Required:
false},
+ iceberg.NestedField{ID: 10, Name: "no_nan_or_null",
+ Type: iceberg.PrimitiveTypes.Float64, Required:
false},
+ iceberg.NestedField{ID: 11, Name:
"all_nulls_missing_nan_float",
+ Type: iceberg.PrimitiveTypes.Float32, Required:
false},
+ iceberg.NestedField{ID: 12, Name:
"all_same_value_or_null",
+ Type: iceberg.PrimitiveTypes.String, Required:
false},
+ iceberg.NestedField{ID: 13, Name:
"no_nulls_same_value_a",
+ Type: iceberg.PrimitiveTypes.Binary, Required:
false},
+ )
+ )
+
+ partFields := make([]iceberg.PartitionField, 0, testSchema.NumFields())
+ for _, f := range testSchema.Fields() {
+ partFields = append(partFields, iceberg.PartitionField{
+ Name: f.Name,
+ SourceID: f.ID,
+ FieldID: f.ID,
+ Transform: iceberg.IdentityTransform{},
+ })
+ }
+
+ spec := iceberg.NewPartitionSpec(partFields...)
+ manifestNoStats := iceberg.NewManifestV1Builder("", 0, 0, 0).Build()
+ manifest := iceberg.NewManifestV1Builder("", 0, 0, 0).Partitions(
+ []iceberg.FieldSummary{
+ { // id
+ ContainsNull: false,
+ ContainsNaN: nil,
+ LowerBound: &IntMin,
+ UpperBound: &IntMax,
+ },
+ { // all_nulls_missing_nan
+ ContainsNull: true,
+ ContainsNaN: nil,
+ LowerBound: nil,
+ UpperBound: nil,
+ },
+ { // some_nulls
+ ContainsNull: true,
+ ContainsNaN: nil,
+ LowerBound: &StringMin,
+ UpperBound: &StringMax,
+ },
+ { // no_nulls
+ ContainsNull: false,
+ ContainsNaN: nil,
+ LowerBound: &StringMin,
+ UpperBound: &StringMax,
+ },
+ { // float
+ ContainsNull: true,
+ ContainsNaN: nil,
+ LowerBound: &FloatMin,
+ UpperBound: &FloatMax,
+ },
+ { // all_nulls_double
+ ContainsNull: true,
+ ContainsNaN: nil,
+ LowerBound: nil,
+ UpperBound: nil,
+ },
+ { // all_nulls_no_nans
+ ContainsNull: true,
+ ContainsNaN: &NanFalse,
+ LowerBound: nil,
+ UpperBound: nil,
+ },
+ { // all_nans
+ ContainsNull: false,
+ ContainsNaN: &NanTrue,
+ LowerBound: nil,
+ UpperBound: nil,
+ },
+ { // both_nan_and_null
+ ContainsNull: true,
+ ContainsNaN: &NanTrue,
+ LowerBound: nil,
+ UpperBound: nil,
+ },
+ { // no_nan_or_null
+ ContainsNull: false,
+ ContainsNaN: &NanFalse,
+ LowerBound: &DblMin,
+ UpperBound: &DblMax,
+ },
+ { // all_nulls_missing_nan_float
+ ContainsNull: true,
+ ContainsNaN: nil,
+ LowerBound: nil,
+ UpperBound: nil,
+ },
+ { // all_same_value_or_null
+ ContainsNull: true,
+ ContainsNaN: nil,
+ LowerBound: &StringMin,
+ UpperBound: &StringMin,
+ },
+ { // no_nulls_same_value_a
+ ContainsNull: false,
+ ContainsNaN: nil,
+ LowerBound: &StringMin,
+ UpperBound: &StringMin,
+ },
+ }).Build()
+
+ t.Run("all nulls", func(t *testing.T) {
+ tests := []struct {
+ field string
+ expected bool
+ msg string
+ }{
+ {"all_nulls_missing_nan", false, "should skip: all
nulls column with non-floating type contains all null"},
+ {"all_nulls_missing_nan_float", true, "should read: no
NaN information may indicate presence of NaN value"},
+ {"some_nulls", true, "should read: column with some
nulls contains a non-null value"},
+ {"no_nulls", true, "should read: non-null column
contains a non-null value"},
+ }
+
+ for _, tt := range tests {
+ eval, err := newManifestEvaluator(spec, testSchema,
+ iceberg.NotNull(iceberg.Reference(tt.field)),
true)
+ require.NoError(t, err)
+
+ result, err := eval(manifest)
+ require.NoError(t, err)
+ assert.Equal(t, tt.expected, result, tt.msg)
+ }
+ })
+
+ t.Run("no nulls", func(t *testing.T) {
+ tests := []struct {
+ field string
+ expected bool
+ msg string
+ }{
+ {"all_nulls_missing_nan", true, "should read: at least
one null value in all null column"},
+ {"some_nulls", true, "should read: column with some
nulls contains a null value"},
+ {"no_nulls", false, "should skip: non-null column
contains no null values"},
+ {"both_nan_and_null", true, "should read:
both_nan_and_null column contains no null values"},
+ }
+
+ for _, tt := range tests {
+ eval, err := newManifestEvaluator(spec, testSchema,
+ iceberg.IsNull(iceberg.Reference(tt.field)),
true)
+ require.NoError(t, err)
+
+ result, err := eval(manifest)
+ require.NoError(t, err)
+ assert.Equal(t, tt.expected, result, tt.msg)
+ }
+ })
+
+ t.Run("is nan", func(t *testing.T) {
+ tests := []struct {
+ field string
+ expected bool
+ msg string
+ }{
+ {"float", true, "should read: no information on if
there are nan values in float column"},
+ {"all_nulls_double", true, "should read: no NaN
information may indicate presence of NaN value"},
+ {"all_nulls_missing_nan_float", true, "should read: no
NaN information may indicate presence of NaN value"},
+ {"all_nulls_no_nans", false, "should skip: no nan
column doesn't contain nan value"},
+ {"all_nans", true, "should read: all_nans column
contains nan value"},
+ {"both_nan_and_null", true, "should read:
both_nan_and_null column contains nan value"},
+ {"no_nan_or_null", false, "should skip: no_nan_or_null
column doesn't contain nan value"},
+ }
+
+ for _, tt := range tests {
+ eval, err := newManifestEvaluator(spec, testSchema,
+ iceberg.IsNaN(iceberg.Reference(tt.field)),
true)
+ require.NoError(t, err)
+
+ result, err := eval(manifest)
+ require.NoError(t, err)
+ assert.Equal(t, tt.expected, result, tt.msg)
+ }
+ })
+
+ t.Run("not nan", func(t *testing.T) {
+ tests := []struct {
+ field string
+ expected bool
+ msg string
+ }{
+ {"float", true, "should read: no information on if
there are nan values in float column"},
+ {"all_nulls_double", true, "should read: all null
column contains non nan value"},
+ {"all_nulls_no_nans", true, "should read: no_nans
column contains non nan value"},
+ {"all_nans", false, "should skip: all nans
columndoesn't contain non nan value"},
+ {"both_nan_and_null", true, "should read:
both_nan_and_null nans column contains non nan value"},
+ {"no_nan_or_null", true, "should read: no_nan_or_null
column contains non nan value"},
+ }
+
+ for _, tt := range tests {
+ eval, err := newManifestEvaluator(spec, testSchema,
+ iceberg.NotNaN(iceberg.Reference(tt.field)),
true)
+ require.NoError(t, err)
+
+ result, err := eval(manifest)
+ require.NoError(t, err)
+ assert.Equal(t, tt.expected, result, tt.msg)
+ }
+ })
+
+ t.Run("test missing stats", func(t *testing.T) {
+ exprs := []iceberg.BooleanExpression{
+ iceberg.LessThan(iceberg.Reference("id"), int32(5)),
+ iceberg.LessThanEqual(iceberg.Reference("id"),
int32(30)),
+ iceberg.EqualTo(iceberg.Reference("id"), int32(70)),
+ iceberg.GreaterThan(iceberg.Reference("id"), int32(78)),
+ iceberg.GreaterThanEqual(iceberg.Reference("id"),
int32(90)),
+ iceberg.NotEqualTo(iceberg.Reference("id"), int32(101)),
+ iceberg.IsNull(iceberg.Reference("id")),
+ iceberg.NotNull(iceberg.Reference("id")),
+ iceberg.IsNaN(iceberg.Reference("float")),
+ iceberg.NotNaN(iceberg.Reference("float")),
+ }
+
+ for _, tt := range exprs {
+ eval, err := newManifestEvaluator(spec, testSchema, tt,
true)
+ require.NoError(t, err)
+
+ result, err := eval(manifestNoStats)
+ require.NoError(t, err)
+ assert.Truef(t, result, "should read when missing stats
for expr: %s", tt)
+ }
+ })
+
+ t.Run("test exprs", func(t *testing.T) {
+ tests := []struct {
+ expr iceberg.BooleanExpression
+ expect bool
+ msg string
+ }{
+
{iceberg.NewNot(iceberg.LessThan(iceberg.Reference("id"),
int32(IntMinValue-25))),
+ true, "should read: not(false)"},
+
{iceberg.NewNot(iceberg.GreaterThan(iceberg.Reference("id"),
int32(IntMinValue-25))),
+ false, "should skip: not(true)"},
+ {iceberg.NewAnd(
+ iceberg.LessThan(iceberg.Reference("id"),
int32(IntMinValue-25)),
+
iceberg.GreaterThanEqual(iceberg.Reference("id"), int32(IntMinValue-30))),
+ false, "should skip: and(false, true)"},
+ {iceberg.NewAnd(
+ iceberg.LessThan(iceberg.Reference("id"),
int32(IntMinValue-25)),
+
iceberg.GreaterThanEqual(iceberg.Reference("id"), int32(IntMaxValue+1))),
+ false, "should skip: and(false, false)"},
+ {iceberg.NewAnd(
+ iceberg.GreaterThan(iceberg.Reference("id"),
int32(IntMinValue-25)),
+ iceberg.LessThanEqual(iceberg.Reference("id"),
int32(IntMinValue))),
+ true, "should read: and(true, true)"},
+ {iceberg.NewOr(
+ iceberg.LessThan(iceberg.Reference("id"),
int32(IntMinValue-25)),
+
iceberg.GreaterThanEqual(iceberg.Reference("id"), int32(IntMaxValue+1))),
+ false, "should skip: or(false, false)"},
+ {iceberg.NewOr(
+ iceberg.LessThan(iceberg.Reference("id"),
int32(IntMinValue-25)),
+
iceberg.GreaterThanEqual(iceberg.Reference("id"), int32(IntMaxValue-19))),
+ true, "should read: or(false, true)"},
+ {iceberg.LessThan(iceberg.Reference("some_nulls"),
"1"), false,
+ "should not read: id range below lower bound"},
+ {iceberg.LessThan(iceberg.Reference("some_nulls"),
"b"), true,
+ "should read: lower bound in range"},
+ {iceberg.LessThan(iceberg.Reference("float"), 15.50),
true,
+ "should read: lower bound in range"},
+ {iceberg.LessThan(iceberg.Reference("no_nan_or_null"),
15.50), true,
+ "should read: lower bound in range"},
+
{iceberg.LessThanEqual(iceberg.Reference("no_nulls_same_value_a"), "a"), true,
+ "should read: lower bound in range"},
+ {iceberg.LessThan(iceberg.Reference("id"),
int32(IntMinValue-25)), false,
+ "should not read: id range below lower bound (5
< 30)"},
+ {iceberg.LessThan(iceberg.Reference("id"),
int32(IntMinValue)), false,
+ "should not read: id range below lower bound
(30 is not < 30)"},
+ {iceberg.LessThan(iceberg.Reference("id"),
int32(IntMinValue+1)), true,
+ "should read: one possible id"},
+ {iceberg.LessThan(iceberg.Reference("id"),
int32(IntMaxValue)), true,
+ "should read: many possible ids"},
+ {iceberg.LessThanEqual(iceberg.Reference("id"),
int32(IntMinValue-25)), false,
+ "should not read: id range below lower bound (5
< 30)"},
+ {iceberg.LessThanEqual(iceberg.Reference("id"),
int32(IntMinValue-1)), false,
+ "should not read: id range below lower bound 29
< 30"},
+ {iceberg.LessThanEqual(iceberg.Reference("id"),
int32(IntMinValue)), true,
+ "should read: one possible id"},
+ {iceberg.LessThanEqual(iceberg.Reference("id"),
int32(IntMaxValue)), true,
+ "should read: many possible ids"},
+ {iceberg.GreaterThan(iceberg.Reference("id"),
int32(IntMaxValue+6)), false,
+ "should not read: id range above upper bound
(85 < 79)"},
+ {iceberg.GreaterThan(iceberg.Reference("id"),
int32(IntMaxValue)), false,
+ "should not read: id range above upper bound
(79 is not > 79)"},
+ {iceberg.GreaterThan(iceberg.Reference("id"),
int32(IntMaxValue-1)), true,
+ "should read: one possible id"},
+ {iceberg.GreaterThan(iceberg.Reference("id"),
int32(IntMaxValue-4)), true,
+ "should read: many possible ids"},
+ {iceberg.GreaterThanEqual(iceberg.Reference("id"),
int32(IntMaxValue+6)), false,
+ "should not read: id range is above upper bound
(85 < 79)"},
+ {iceberg.GreaterThanEqual(iceberg.Reference("id"),
int32(IntMaxValue+1)), false,
+ "should not read: id range above upper bound
(80 > 79)"},
+ {iceberg.GreaterThanEqual(iceberg.Reference("id"),
int32(IntMaxValue)), true,
+ "should read: one possible id"},
+ {iceberg.GreaterThanEqual(iceberg.Reference("id"),
int32(IntMaxValue)), true,
+ "should read: many possible ids"},
+ {iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMinValue-25)), false,
+ "should not read: id below lower bound"},
+ {iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMinValue-1)), false,
+ "should not read: id below lower bound"},
+ {iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMinValue)), true,
+ "should read: id equal to lower bound"},
+ {iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMaxValue-4)), true,
+ "should read: id between lower and upper
bounds"},
+ {iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMaxValue)), true,
+ "should read: id equal to upper bound"},
+ {iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMaxValue+1)), false,
+ "should not read: id above upper bound"},
+ {iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMaxValue+6)), false,
+ "should not read: id above upper bound"},
+ {iceberg.NotEqualTo(iceberg.Reference("id"),
int32(IntMinValue-25)), true,
+ "should read: id below lower bound"},
+ {iceberg.NotEqualTo(iceberg.Reference("id"),
int32(IntMinValue-1)), true,
+ "should read: id below lower bound"},
+ {iceberg.NotEqualTo(iceberg.Reference("id"),
int32(IntMinValue)), true,
+ "should read: id equal to lower bound"},
+ {iceberg.NotEqualTo(iceberg.Reference("id"),
int32(IntMaxValue-4)), true,
+ "should read: id between lower and upper
bounds"},
+ {iceberg.NotEqualTo(iceberg.Reference("id"),
int32(IntMaxValue)), true,
+ "should read: id equal to upper bound"},
+ {iceberg.NotEqualTo(iceberg.Reference("id"),
int32(IntMaxValue+1)), true,
+ "should read: id above upper bound"},
+ {iceberg.NotEqualTo(iceberg.Reference("id"),
int32(IntMaxValue+6)), true,
+ "should read: id above upper bound"},
+
{iceberg.NewNot(iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMinValue-25))), true,
+ "should read: id below lower bound"},
+
{iceberg.NewNot(iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMinValue-1))), true,
+ "should read: id below lower bound"},
+
{iceberg.NewNot(iceberg.EqualTo(iceberg.Reference("id"), int32(IntMinValue))),
true,
+ "should read: id equal to lower bound"},
+
{iceberg.NewNot(iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMaxValue-4))), true,
+ "should read: id between lower and upper
bounds"},
+
{iceberg.NewNot(iceberg.EqualTo(iceberg.Reference("id"), int32(IntMaxValue))),
true,
+ "should read: id equal to upper bound"},
+
{iceberg.NewNot(iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMaxValue+1))), true,
+ "should read: id above upper bound"},
+
{iceberg.NewNot(iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMaxValue+6))), true,
+ "should read: id above upper bound"},
+ {iceberg.IsIn(iceberg.Reference("id"),
int32(IntMinValue-25), IntMinValue-24), false,
+ "should not read: id below lower bound (5 < 30,
6 < 30)"},
+ {iceberg.IsIn(iceberg.Reference("id"),
int32(IntMinValue-2), IntMinValue-1), false,
+ "should not read: id below lower bound (28 <
30, 29 < 30)"},
+ {iceberg.IsIn(iceberg.Reference("id"),
int32(IntMinValue-1), IntMinValue), true,
+ "should read: id equal to lower bound (30 ==
30)"},
+ {iceberg.IsIn(iceberg.Reference("id"),
int32(IntMaxValue-4), IntMaxValue-3), true,
+ "should read: id between lower and upper bounds
(30 < 75 < 79, 30 < 76 < 79)"},
+ {iceberg.IsIn(iceberg.Reference("id"),
int32(IntMaxValue), IntMaxValue+1), true,
+ "should read: id equal to upper bound (79 ==
79)"},
+ {iceberg.IsIn(iceberg.Reference("id"),
int32(IntMaxValue+1), IntMaxValue+2), false,
+ "should not read: id above upper bound (80 >
79, 81 > 79)"},
+ {iceberg.IsIn(iceberg.Reference("id"),
int32(IntMaxValue+6), IntMaxValue+7), false,
+ "should not read: id above upper bound (85 >
79, 86 > 79)"},
+
{iceberg.IsIn(iceberg.Reference("all_nulls_missing_nan"), "abc", "def"), false,
+ "should skip: in on all nulls column"},
+ {iceberg.IsIn(iceberg.Reference("some_nulls"), "abc",
"def"), true,
+ "should read: in on some nulls column"},
+ {iceberg.IsIn(iceberg.Reference("no_nulls"), "abc",
"def"), true,
+ "should read: in on no nulls column"},
+
{iceberg.IsIn(iceberg.Reference("no_nulls_same_value_a"), "a", "b"), true,
+ "should read: in on no nulls column"},
+ {iceberg.IsIn(iceberg.Reference("float"), 0, -5.5),
true,
+ "should read: float equal to lower bound"},
+ {iceberg.IsIn(iceberg.Reference("no_nan_or_null"), 0,
-5.5), true,
+ "should read: float equal to lower bound"},
+ {iceberg.NotIn(iceberg.Reference("id"),
int32(IntMinValue-25), IntMinValue-24), true,
+ "should read: id below lower bound (5 < 30, 6 <
30)"},
+ {iceberg.NotIn(iceberg.Reference("id"),
int32(IntMinValue-2), IntMinValue-1), true,
+ "should read: id below lower bound (28 < 30, 29
< 30)"},
+ {iceberg.NotIn(iceberg.Reference("id"),
int32(IntMinValue-1), IntMinValue), true,
+ "should read: id equal to lower bound (30 ==
30)"},
+ {iceberg.NotIn(iceberg.Reference("id"),
int32(IntMaxValue-4), IntMaxValue-3), true,
+ "should read: id between lower and upper bounds
(30 < 75 < 79, 30 < 76 < 79)"},
+ {iceberg.NotIn(iceberg.Reference("id"),
int32(IntMaxValue), IntMaxValue+1), true,
+ "should read: id equal to upper bound (79 ==
79)"},
+ {iceberg.NotIn(iceberg.Reference("id"),
int32(IntMaxValue+1), IntMaxValue+2), true,
+ "should read: id above upper bound (80 > 79, 81
> 79)"},
+ {iceberg.NotIn(iceberg.Reference("id"),
int32(IntMaxValue+6), IntMaxValue+7), true,
+ "should read: id above upper bound (85 > 79, 86
> 79)"},
+
{iceberg.NotIn(iceberg.Reference("all_nulls_missing_nan"), "abc", "def"), true,
+ "should read: notIn on all nulls column"},
+ {iceberg.NotIn(iceberg.Reference("some_nulls"), "abc",
"def"), true,
+ "should read: notIn on some nulls column"},
+ {iceberg.NotIn(iceberg.Reference("no_nulls"), "abc",
"def"), true,
+ "should read: notIn on no nulls column"},
+ {iceberg.StartsWith(iceberg.Reference("some_nulls"),
"a"), true,
+ "should read: range matches"},
+ {iceberg.StartsWith(iceberg.Reference("some_nulls"),
"aa"), true,
+ "should read: range matches"},
+ {iceberg.StartsWith(iceberg.Reference("some_nulls"),
"dddd"), true,
+ "should read: range matches"},
+ {iceberg.StartsWith(iceberg.Reference("some_nulls"),
"z"), true,
+ "should read: range matches"},
+ {iceberg.StartsWith(iceberg.Reference("no_nulls"),
"a"), true,
+ "should read: range matches"},
+ {iceberg.StartsWith(iceberg.Reference("some_nulls"),
"zzzz"), false,
+ "should skip: range doesn't match"},
+ {iceberg.StartsWith(iceberg.Reference("some_nulls"),
"1"), false,
+ "should skip: range doesn't match"},
+
{iceberg.StartsWith(iceberg.Reference("no_nulls_same_value_a"), "a"), true,
+ "should read: all values start with the
prefix"},
+ {iceberg.NotStartsWith(iceberg.Reference("some_nulls"),
"a"), true,
+ "should read: range matches"},
+ {iceberg.NotStartsWith(iceberg.Reference("some_nulls"),
"aa"), true,
+ "should read: range matches"},
+ {iceberg.NotStartsWith(iceberg.Reference("some_nulls"),
"dddd"), true,
+ "should read: range matches"},
+ {iceberg.NotStartsWith(iceberg.Reference("some_nulls"),
"z"), true,
+ "should read: range matches"},
+ {iceberg.NotStartsWith(iceberg.Reference("no_nulls"),
"a"), true,
+ "should read: range matches"},
+ {iceberg.NotStartsWith(iceberg.Reference("some_nulls"),
"zzzz"), true,
+ "should read: range matches"},
+ {iceberg.NotStartsWith(iceberg.Reference("some_nulls"),
"1"), true,
+ "should read: range matches"},
+
{iceberg.NotStartsWith(iceberg.Reference("all_same_value_or_null"), "a"), true,
+ "should read: range matches"},
+
{iceberg.NotStartsWith(iceberg.Reference("all_same_value_or_null"), "aa"), true,
+ "should read: range matches"},
+
{iceberg.NotStartsWith(iceberg.Reference("all_same_value_or_null"), "A"), true,
+ "should read: range matches"},
+ // Iceberg does not implement SQL 3-way boolean logic,
so the choice of an
+ // all null column matching is by definition in order
to surface more values
+ // to the query engine to allow it to make its own
decision
+
{iceberg.NotStartsWith(iceberg.Reference("all_nulls_missing_nan"), "A"), true,
+ "should read: range matches"},
+
{iceberg.NotStartsWith(iceberg.Reference("no_nulls_same_value_a"), "a"), false,
+ "should not read: all values start with the
prefix"},
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.expr.String(), func(t *testing.T) {
+ eval, err := newManifestEvaluator(spec,
testSchema,
+ tt.expr, true)
+ require.NoError(t, err)
+
+ result, err := eval(manifest)
+ require.NoError(t, err)
+ assert.Equal(t, tt.expect, result, tt.msg)
+ })
+ }
+ })
+}
diff --git a/visitors.go b/visitors.go
index 3428b2c..7525026 100644
--- a/visitors.go
+++ b/visitors.go
@@ -30,7 +30,7 @@ import (
type BooleanExprVisitor[T any] interface {
VisitTrue() T
VisitFalse() T
- VisitNot(childREsult T) T
+ VisitNot(childResult T) T
VisitAnd(left, right T) T
VisitOr(left, right T) T
VisitUnbound(UnboundPredicate) T
@@ -395,470 +395,3 @@ func (rewriteNotVisitor) VisitUnbound(pred
UnboundPredicate) BooleanExpression {
func (rewriteNotVisitor) VisitBound(pred BoundPredicate) BooleanExpression {
return pred
}
-
-const (
- rowsMightMatch, rowsMustMatch = true, true
- rowsCannotMatch, rowsMightNotMatch = false, false
- inPredicateLimit = 200
-)
-
-// NewManifestEvaluator returns a function that can be used to evaluate
whether a particular
-// manifest file has rows that might or might not match a given partition
filter by using
-// the stats provided in the partitions
(UpperBound/LowerBound/ContainsNull/ContainsNaN).
-func NewManifestEvaluator(spec PartitionSpec, schema *Schema, partitionFilter
BooleanExpression, caseSensitive bool) (func(ManifestFile) (bool, error),
error) {
- partType := spec.PartitionType(schema)
- partSchema := NewSchema(0, partType.FieldList...)
- filter, err := RewriteNotExpr(partitionFilter)
- if err != nil {
- return nil, err
- }
-
- boundFilter, err := BindExpr(partSchema, filter, caseSensitive)
- if err != nil {
- return nil, err
- }
-
- return (&manifestEvalVisitor{partitionFilter: boundFilter}).Eval, nil
-}
-
-type manifestEvalVisitor struct {
- partitionFields []FieldSummary
- partitionFilter BooleanExpression
-}
-
-func (m *manifestEvalVisitor) Eval(manifest ManifestFile) (bool, error) {
- if parts := manifest.Partitions(); len(parts) > 0 {
- m.partitionFields = parts
- return VisitExpr(m.partitionFilter, m)
- }
-
- return rowsMightMatch, nil
-}
-
-func allBoundCmp[T LiteralType](bound Literal, set Set[Literal], want int)
bool {
- val := bound.(TypedLiteral[T])
- cmp := val.Comparator()
-
- return set.All(func(e Literal) bool {
- return cmp(val.Value(), e.(TypedLiteral[T]).Value()) == want
- })
-}
-
-func allBoundCheck(bound Literal, set Set[Literal], want int) bool {
- switch bound.Type().(type) {
- case BooleanType:
- return allBoundCmp[bool](bound, set, want)
- case Int32Type:
- return allBoundCmp[int32](bound, set, want)
- case Int64Type:
- return allBoundCmp[int64](bound, set, want)
- case Float32Type:
- return allBoundCmp[float32](bound, set, want)
- case Float64Type:
- return allBoundCmp[float64](bound, set, want)
- case DateType:
- return allBoundCmp[Date](bound, set, want)
- case TimeType:
- return allBoundCmp[Time](bound, set, want)
- case TimestampType, TimestampTzType:
- return allBoundCmp[Timestamp](bound, set, want)
- case BinaryType, FixedType:
- return allBoundCmp[[]byte](bound, set, want)
- case StringType:
- return allBoundCmp[string](bound, set, want)
- case UUIDType:
- return allBoundCmp[uuid.UUID](bound, set, want)
- case DecimalType:
- return allBoundCmp[Decimal](bound, set, want)
- }
- panic(ErrType)
-}
-
-func (m *manifestEvalVisitor) VisitIn(term BoundTerm, literals Set[Literal])
bool {
- pos := term.Ref().Pos()
- field := m.partitionFields[pos]
-
- if field.LowerBound == nil {
- return rowsCannotMatch
- }
-
- if literals.Len() > inPredicateLimit {
- return rowsMightMatch
- }
-
- lower, err := LiteralFromBytes(term.Type(), *field.LowerBound)
- if err != nil {
- panic(err)
- }
-
- if allBoundCheck(lower, literals, 1) {
- return rowsCannotMatch
- }
-
- if field.UpperBound != nil {
- upper, err := LiteralFromBytes(term.Type(), *field.UpperBound)
- if err != nil {
- panic(err)
- }
-
- if allBoundCheck(upper, literals, -1) {
- return rowsCannotMatch
- }
- }
-
- return rowsMightMatch
-}
-
-func (m *manifestEvalVisitor) VisitNotIn(term BoundTerm, literals
Set[Literal]) bool {
- // because the bounds are not necessarily a min or max value, this
cannot be answered using them
- // notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a value
in col
- return rowsMightMatch
-}
-
-func (m *manifestEvalVisitor) VisitIsNan(term BoundTerm) bool {
- pos := term.Ref().Pos()
- field := m.partitionFields[pos]
-
- if field.ContainsNaN != nil && !*field.ContainsNaN {
- return rowsCannotMatch
- }
-
- return rowsMightMatch
-}
-
-func (m *manifestEvalVisitor) VisitNotNan(term BoundTerm) bool {
- pos := term.Ref().Pos()
- field := m.partitionFields[pos]
-
- if field.ContainsNaN != nil && *field.ContainsNaN &&
!field.ContainsNull && field.LowerBound == nil {
- return rowsCannotMatch
- }
-
- return rowsMightMatch
-}
-
-func (m *manifestEvalVisitor) VisitIsNull(term BoundTerm) bool {
- pos := term.Ref().Pos()
- field := m.partitionFields[pos]
-
- if !field.ContainsNull {
- return rowsCannotMatch
- }
-
- return rowsMightMatch
-}
-
-func (m *manifestEvalVisitor) VisitNotNull(term BoundTerm) bool {
- pos := term.Ref().Pos()
- field := m.partitionFields[pos]
-
- // ContainsNull encodes whether at least one partition value is null
- // lowerBound is null if all partition values are null
- allNull := field.ContainsNull && field.LowerBound == nil
- if allNull && (term.Ref().Type().Equals(PrimitiveTypes.Float32) ||
term.Ref().Type().Equals(PrimitiveTypes.Float64)) {
- // floating point types may include NaN values, which we check
separately
- // in case bounds don't include NaN values, ContainsNaN needs
to be checked
- allNull = field.ContainsNaN != nil && !*field.ContainsNaN
- }
-
- if allNull {
- return rowsCannotMatch
- }
-
- return rowsMightMatch
-}
-
-func getCmp[T LiteralType](b TypedLiteral[T]) func(Literal, Literal) int {
- cmp := b.Comparator()
- return func(l1, l2 Literal) int {
- return cmp(l1.(TypedLiteral[T]).Value(),
l2.(TypedLiteral[T]).Value())
- }
-}
-
-func getCmpLiteral(boundary Literal) func(Literal, Literal) int {
- switch l := boundary.(type) {
- case TypedLiteral[bool]:
- return getCmp(l)
- case TypedLiteral[int32]:
- return getCmp(l)
- case TypedLiteral[int64]:
- return getCmp(l)
- case TypedLiteral[float32]:
- return getCmp(l)
- case TypedLiteral[float64]:
- return getCmp(l)
- case TypedLiteral[Date]:
- return getCmp(l)
- case TypedLiteral[Time]:
- return getCmp(l)
- case TypedLiteral[Timestamp]:
- return getCmp(l)
- case TypedLiteral[[]byte]:
- return getCmp(l)
- case TypedLiteral[string]:
- return getCmp(l)
- case TypedLiteral[uuid.UUID]:
- return getCmp(l)
- case TypedLiteral[Decimal]:
- return getCmp(l)
- }
- panic(ErrType)
-}
-
-func (m *manifestEvalVisitor) VisitEqual(term BoundTerm, lit Literal) bool {
- pos := term.Ref().Pos()
- field := m.partitionFields[pos]
-
- if field.LowerBound == nil || field.UpperBound == nil {
- // values are all null and literal cannot contain null
- return rowsCannotMatch
- }
-
- lower, err := LiteralFromBytes(term.Ref().Type(), *field.LowerBound)
- if err != nil {
- panic(err)
- }
-
- cmp := getCmpLiteral(lower)
- if cmp(lower, lit) == 1 {
- return rowsCannotMatch
- }
-
- upper, err := LiteralFromBytes(term.Ref().Type(), *field.UpperBound)
- if err != nil {
- panic(err)
- }
-
- if cmp(lit, upper) == 1 {
- return rowsCannotMatch
- }
-
- return rowsMightMatch
-}
-
-func (m *manifestEvalVisitor) VisitNotEqual(term BoundTerm, lit Literal) bool {
- // because bounds are not necessarily a min or max, this cannot be
answered
- // using them. notEq(col, X) with (X, Y) doesn't guarantee X is a value
in col
- return rowsMightMatch
-}
-
-func (m *manifestEvalVisitor) VisitGreaterEqual(term BoundTerm, lit Literal)
bool {
- pos := term.Ref().Pos()
- field := m.partitionFields[pos]
-
- if field.UpperBound == nil {
- return rowsCannotMatch
- }
-
- upper, err := LiteralFromBytes(term.Ref().Type(), *field.UpperBound)
- if err != nil {
- panic(err)
- }
-
- if getCmpLiteral(upper)(lit, upper) == 1 {
- return rowsCannotMatch
- }
-
- return rowsMightMatch
-}
-
-func (m *manifestEvalVisitor) VisitGreater(term BoundTerm, lit Literal) bool {
- pos := term.Ref().Pos()
- field := m.partitionFields[pos]
-
- if field.UpperBound == nil {
- return rowsCannotMatch
- }
-
- upper, err := LiteralFromBytes(term.Ref().Type(), *field.UpperBound)
- if err != nil {
- panic(err)
- }
-
- if getCmpLiteral(upper)(lit, upper) >= 0 {
- return rowsCannotMatch
- }
-
- return rowsMightMatch
-}
-
-func (m *manifestEvalVisitor) VisitLessEqual(term BoundTerm, lit Literal) bool
{
- pos := term.Ref().Pos()
- field := m.partitionFields[pos]
-
- if field.LowerBound == nil {
- return rowsCannotMatch
- }
-
- lower, err := LiteralFromBytes(term.Ref().Type(), *field.LowerBound)
- if err != nil {
- panic(err)
- }
-
- if getCmpLiteral(lower)(lit, lower) == -1 {
- return rowsCannotMatch
- }
-
- return rowsMightMatch
-}
-
-func (m *manifestEvalVisitor) VisitLess(term BoundTerm, lit Literal) bool {
- pos := term.Ref().Pos()
- field := m.partitionFields[pos]
-
- if field.LowerBound == nil {
- return rowsCannotMatch
- }
-
- lower, err := LiteralFromBytes(term.Ref().Type(), *field.LowerBound)
- if err != nil {
- panic(err)
- }
-
- if getCmpLiteral(lower)(lit, lower) <= 0 {
- return rowsCannotMatch
- }
-
- return rowsMightMatch
-}
-
-func (m *manifestEvalVisitor) VisitStartsWith(term BoundTerm, lit Literal)
bool {
- pos := term.Ref().Pos()
- field := m.partitionFields[pos]
-
- var prefix string
- if val, ok := lit.(TypedLiteral[string]); ok {
- prefix = val.Value()
- } else {
- prefix = string(lit.(TypedLiteral[[]byte]).Value())
- }
-
- lenPrefix := len(prefix)
-
- if field.LowerBound == nil {
- return rowsCannotMatch
- }
-
- lower, err := LiteralFromBytes(term.Ref().Type(), *field.LowerBound)
- if err != nil {
- panic(err)
- }
-
- // truncate lower bound so that its length is not greater than the
length of prefix
- var v string
- switch l := lower.(type) {
- case TypedLiteral[string]:
- v = l.Value()
- if len(v) > lenPrefix {
- v = v[:lenPrefix]
- }
- case TypedLiteral[[]byte]:
- v = string(l.Value())
- if len(v) > lenPrefix {
- v = v[:lenPrefix]
- }
- }
-
- if v > prefix {
- return rowsCannotMatch
- }
-
- if field.UpperBound == nil {
- return rowsCannotMatch
- }
-
- upper, err := LiteralFromBytes(term.Ref().Type(), *field.UpperBound)
- if err != nil {
- panic(err)
- }
-
- switch u := upper.(type) {
- case TypedLiteral[string]:
- v = u.Value()
- if len(v) > lenPrefix {
- v = v[:lenPrefix]
- }
- case TypedLiteral[[]byte]:
- v = string(u.Value())
- if len(v) > lenPrefix {
- v = v[:lenPrefix]
- }
- }
-
- if v < prefix {
- return rowsCannotMatch
- }
-
- return rowsMightMatch
-}
-
-func (m *manifestEvalVisitor) VisitNotStartsWith(term BoundTerm, lit Literal)
bool {
- pos := term.Ref().Pos()
- field := m.partitionFields[pos]
-
- if field.ContainsNull || field.LowerBound == nil || field.UpperBound ==
nil {
- return rowsMightMatch
- }
-
- // NotStartsWith will match unless ALL values must start with the
prefix.
- // this happens when the lower and upper bounds BOTH start with the
prefix
- lower, err := LiteralFromBytes(term.Ref().Type(), *field.LowerBound)
- if err != nil {
- panic(err)
- }
-
- upper, err := LiteralFromBytes(term.Ref().Type(), *field.UpperBound)
- if err != nil {
- panic(err)
- }
-
- var (
- prefix, lowerBound, upperBound string
- )
- if val, ok := lit.(TypedLiteral[string]); ok {
- prefix = val.Value()
- lowerBound, upperBound = lower.(TypedLiteral[string]).Value(),
upper.(TypedLiteral[string]).Value()
- } else {
- prefix = string(lit.(TypedLiteral[[]byte]).Value())
- lowerBound = string(lower.(TypedLiteral[[]byte]).Value())
- upperBound = string(upper.(TypedLiteral[[]byte]).Value())
- }
-
- lenPrefix := len(prefix)
- if len(lowerBound) < lenPrefix {
- return rowsMightMatch
- }
-
- if lowerBound[:lenPrefix] == prefix {
- // if upper is shorter than the prefix, it can't start with the prefix
- if len(upperBound) < lenPrefix {
- return rowsMightMatch
- }
-
- if upperBound[:lenPrefix] == prefix {
- return rowsCannotMatch
- }
- }
-
- return rowsMightMatch
-}
-
-func (m *manifestEvalVisitor) VisitTrue() bool {
- return rowsMightMatch
-}
-
-func (m *manifestEvalVisitor) VisitFalse() bool {
- return rowsCannotMatch
-}
-
-func (m *manifestEvalVisitor) VisitUnbound(UnboundPredicate) bool {
- panic("need bound predicate")
-}
-
-func (m *manifestEvalVisitor) VisitBound(pred BoundPredicate) bool {
- return VisitBoundPredicate(pred, m)
-}
-
-func (m *manifestEvalVisitor) VisitNot(child bool) bool { return !child }
-func (m *manifestEvalVisitor) VisitAnd(left, right bool) bool { return left &&
right }
-func (m *manifestEvalVisitor) VisitOr(left, right bool) bool { return left ||
right }
diff --git a/visitors_test.go b/visitors_test.go
index 688c1cc..8b44236 100644
--- a/visitors_test.go
+++ b/visitors_test.go
@@ -576,484 +576,6 @@ func TestEvaluatorCmpTypes(t *testing.T) {
}
}
-func TestManifestEvaluator(t *testing.T) {
- const (
- IntMinValue, IntMaxValue = 30, 79
- )
-
- var (
- IntMin, IntMax = []byte{byte(IntMinValue), 0x00, 0x00,
0x00}, []byte{byte(IntMaxValue), 0x00, 0x00, 0x00}
- StringMin, StringMax = []byte("a"), []byte("z")
- FloatMin, _ = iceberg.Float32Literal(0).MarshalBinary()
- FloatMax, _ =
iceberg.Float32Literal(20).MarshalBinary()
- DblMin, _ = iceberg.Float64Literal(0).MarshalBinary()
- DblMax, _ =
iceberg.Float64Literal(20).MarshalBinary()
- NanTrue, NanFalse = true, false
-
- testSchema = iceberg.NewSchema(1,
- iceberg.NestedField{ID: 1, Name: "id",
- Type: iceberg.PrimitiveTypes.Int32, Required:
true},
- iceberg.NestedField{ID: 2, Name:
"all_nulls_missing_nan",
- Type: iceberg.PrimitiveTypes.String, Required:
false},
- iceberg.NestedField{ID: 3, Name: "some_nulls",
- Type: iceberg.PrimitiveTypes.String, Required:
false},
- iceberg.NestedField{ID: 4, Name: "no_nulls",
- Type: iceberg.PrimitiveTypes.String, Required:
false},
- iceberg.NestedField{ID: 5, Name: "float",
- Type: iceberg.PrimitiveTypes.Float32, Required:
false},
- iceberg.NestedField{ID: 6, Name: "all_nulls_double",
- Type: iceberg.PrimitiveTypes.Float64, Required:
false},
- iceberg.NestedField{ID: 7, Name: "all_nulls_no_nans",
- Type: iceberg.PrimitiveTypes.Float32, Required:
false},
- iceberg.NestedField{ID: 8, Name: "all_nans",
- Type: iceberg.PrimitiveTypes.Float64, Required:
false},
- iceberg.NestedField{ID: 9, Name: "both_nan_and_null",
- Type: iceberg.PrimitiveTypes.Float32, Required:
false},
- iceberg.NestedField{ID: 10, Name: "no_nan_or_null",
- Type: iceberg.PrimitiveTypes.Float64, Required:
false},
- iceberg.NestedField{ID: 11, Name:
"all_nulls_missing_nan_float",
- Type: iceberg.PrimitiveTypes.Float32, Required:
false},
- iceberg.NestedField{ID: 12, Name:
"all_same_value_or_null",
- Type: iceberg.PrimitiveTypes.String, Required:
false},
- iceberg.NestedField{ID: 13, Name:
"no_nulls_same_value_a",
- Type: iceberg.PrimitiveTypes.Binary, Required:
false},
- )
- )
-
- partFields := make([]iceberg.PartitionField, 0, testSchema.NumFields())
- for _, f := range testSchema.Fields() {
- partFields = append(partFields, iceberg.PartitionField{
- Name: f.Name,
- SourceID: f.ID,
- FieldID: f.ID,
- Transform: iceberg.IdentityTransform{},
- })
- }
-
- spec := iceberg.NewPartitionSpec(partFields...)
- manifestNoStats := iceberg.NewManifestV1Builder("", 0, 0, 0).Build()
- manifest := iceberg.NewManifestV1Builder("", 0, 0, 0).Partitions(
- []iceberg.FieldSummary{
- { // id
- ContainsNull: false,
- ContainsNaN: nil,
- LowerBound: &IntMin,
- UpperBound: &IntMax,
- },
- { // all_nulls_missing_nan
- ContainsNull: true,
- ContainsNaN: nil,
- LowerBound: nil,
- UpperBound: nil,
- },
- { // some_nulls
- ContainsNull: true,
- ContainsNaN: nil,
- LowerBound: &StringMin,
- UpperBound: &StringMax,
- },
- { // no_nulls
- ContainsNull: false,
- ContainsNaN: nil,
- LowerBound: &StringMin,
- UpperBound: &StringMax,
- },
- { // float
- ContainsNull: true,
- ContainsNaN: nil,
- LowerBound: &FloatMin,
- UpperBound: &FloatMax,
- },
- { // all_nulls_double
- ContainsNull: true,
- ContainsNaN: nil,
- LowerBound: nil,
- UpperBound: nil,
- },
- { // all_nulls_no_nans
- ContainsNull: true,
- ContainsNaN: &NanFalse,
- LowerBound: nil,
- UpperBound: nil,
- },
- { // all_nans
- ContainsNull: false,
- ContainsNaN: &NanTrue,
- LowerBound: nil,
- UpperBound: nil,
- },
- { // both_nan_and_null
- ContainsNull: true,
- ContainsNaN: &NanTrue,
- LowerBound: nil,
- UpperBound: nil,
- },
- { // no_nan_or_null
- ContainsNull: false,
- ContainsNaN: &NanFalse,
- LowerBound: &DblMin,
- UpperBound: &DblMax,
- },
- { // all_nulls_missing_nan_float
- ContainsNull: true,
- ContainsNaN: nil,
- LowerBound: nil,
- UpperBound: nil,
- },
- { // all_same_value_or_null
- ContainsNull: true,
- ContainsNaN: nil,
- LowerBound: &StringMin,
- UpperBound: &StringMin,
- },
- { // no_nulls_same_value_a
- ContainsNull: false,
- ContainsNaN: nil,
- LowerBound: &StringMin,
- UpperBound: &StringMin,
- },
- }).Build()
-
- t.Run("all nulls", func(t *testing.T) {
- tests := []struct {
- field string
- expected bool
- msg string
- }{
- {"all_nulls_missing_nan", false, "should skip: all
nulls column with non-floating type contains all null"},
- {"all_nulls_missing_nan_float", true, "should read: no
NaN information may indicate presence of NaN value"},
- {"some_nulls", true, "should read: column with some
nulls contains a non-null value"},
- {"no_nulls", true, "should read: non-null column
contains a non-null value"},
- }
-
- for _, tt := range tests {
- eval, err := iceberg.NewManifestEvaluator(spec,
testSchema,
- iceberg.NotNull(iceberg.Reference(tt.field)),
true)
- require.NoError(t, err)
-
- result, err := eval(manifest)
- require.NoError(t, err)
- assert.Equal(t, tt.expected, result, tt.msg)
- }
- })
-
- t.Run("no nulls", func(t *testing.T) {
- tests := []struct {
- field string
- expected bool
- msg string
- }{
- {"all_nulls_missing_nan", true, "should read: at least
one null value in all null column"},
- {"some_nulls", true, "should read: column with some
nulls contains a null value"},
- {"no_nulls", false, "should skip: non-null column
contains no null values"},
- {"both_nan_and_null", true, "should read:
both_nan_and_null column contains no null values"},
- }
-
- for _, tt := range tests {
- eval, err := iceberg.NewManifestEvaluator(spec,
testSchema,
- iceberg.IsNull(iceberg.Reference(tt.field)),
true)
- require.NoError(t, err)
-
- result, err := eval(manifest)
- require.NoError(t, err)
- assert.Equal(t, tt.expected, result, tt.msg)
- }
- })
-
- t.Run("is nan", func(t *testing.T) {
- tests := []struct {
- field string
- expected bool
- msg string
- }{
- {"float", true, "should read: no information on if
there are nan values in float column"},
- {"all_nulls_double", true, "should read: no NaN
information may indicate presence of NaN value"},
- {"all_nulls_missing_nan_float", true, "should read: no
NaN information may indicate presence of NaN value"},
- {"all_nulls_no_nans", false, "should skip: no nan
column doesn't contain nan value"},
- {"all_nans", true, "should read: all_nans column
contains nan value"},
- {"both_nan_and_null", true, "should read:
both_nan_and_null column contains nan value"},
- {"no_nan_or_null", false, "should skip: no_nan_or_null
column doesn't contain nan value"},
- }
-
- for _, tt := range tests {
- eval, err := iceberg.NewManifestEvaluator(spec,
testSchema,
- iceberg.IsNaN(iceberg.Reference(tt.field)),
true)
- require.NoError(t, err)
-
- result, err := eval(manifest)
- require.NoError(t, err)
- assert.Equal(t, tt.expected, result, tt.msg)
- }
- })
-
- t.Run("not nan", func(t *testing.T) {
- tests := []struct {
- field string
- expected bool
- msg string
- }{
- {"float", true, "should read: no information on if
there are nan values in float column"},
- {"all_nulls_double", true, "should read: all null
column contains non nan value"},
- {"all_nulls_no_nans", true, "should read: no_nans
column contains non nan value"},
- {"all_nans", false, "should skip: all nans
columndoesn't contain non nan value"},
- {"both_nan_and_null", true, "should read:
both_nan_and_null nans column contains non nan value"},
- {"no_nan_or_null", true, "should read: no_nan_or_null
column contains non nan value"},
- }
-
- for _, tt := range tests {
- eval, err := iceberg.NewManifestEvaluator(spec,
testSchema,
- iceberg.NotNaN(iceberg.Reference(tt.field)),
true)
- require.NoError(t, err)
-
- result, err := eval(manifest)
- require.NoError(t, err)
- assert.Equal(t, tt.expected, result, tt.msg)
- }
- })
-
- t.Run("test missing stats", func(t *testing.T) {
- exprs := []iceberg.BooleanExpression{
- iceberg.LessThan(iceberg.Reference("id"), int32(5)),
- iceberg.LessThanEqual(iceberg.Reference("id"),
int32(30)),
- iceberg.EqualTo(iceberg.Reference("id"), int32(70)),
- iceberg.GreaterThan(iceberg.Reference("id"), int32(78)),
- iceberg.GreaterThanEqual(iceberg.Reference("id"),
int32(90)),
- iceberg.NotEqualTo(iceberg.Reference("id"), int32(101)),
- iceberg.IsNull(iceberg.Reference("id")),
- iceberg.NotNull(iceberg.Reference("id")),
- iceberg.IsNaN(iceberg.Reference("float")),
- iceberg.NotNaN(iceberg.Reference("float")),
- }
-
- for _, tt := range exprs {
- eval, err := iceberg.NewManifestEvaluator(spec,
testSchema, tt, true)
- require.NoError(t, err)
-
- result, err := eval(manifestNoStats)
- require.NoError(t, err)
- assert.Truef(t, result, "should read when missing stats
for expr: %s", tt)
- }
- })
-
- t.Run("test exprs", func(t *testing.T) {
- tests := []struct {
- expr iceberg.BooleanExpression
- expect bool
- msg string
- }{
-
{iceberg.NewNot(iceberg.LessThan(iceberg.Reference("id"),
int32(IntMinValue-25))),
- true, "should read: not(false)"},
-
{iceberg.NewNot(iceberg.GreaterThan(iceberg.Reference("id"),
int32(IntMinValue-25))),
- false, "should skip: not(true)"},
- {iceberg.NewAnd(
- iceberg.LessThan(iceberg.Reference("id"),
int32(IntMinValue-25)),
-
iceberg.GreaterThanEqual(iceberg.Reference("id"), int32(IntMinValue-30))),
- false, "should skip: and(false, true)"},
- {iceberg.NewAnd(
- iceberg.LessThan(iceberg.Reference("id"),
int32(IntMinValue-25)),
-
iceberg.GreaterThanEqual(iceberg.Reference("id"), int32(IntMaxValue+1))),
- false, "should skip: and(false, false)"},
- {iceberg.NewAnd(
- iceberg.GreaterThan(iceberg.Reference("id"),
int32(IntMinValue-25)),
- iceberg.LessThanEqual(iceberg.Reference("id"),
int32(IntMinValue))),
- true, "should read: and(true, true)"},
- {iceberg.NewOr(
- iceberg.LessThan(iceberg.Reference("id"),
int32(IntMinValue-25)),
-
iceberg.GreaterThanEqual(iceberg.Reference("id"), int32(IntMaxValue+1))),
- false, "should skip: or(false, false)"},
- {iceberg.NewOr(
- iceberg.LessThan(iceberg.Reference("id"),
int32(IntMinValue-25)),
-
iceberg.GreaterThanEqual(iceberg.Reference("id"), int32(IntMaxValue-19))),
- true, "should read: or(false, true)"},
- {iceberg.LessThan(iceberg.Reference("some_nulls"),
"1"), false,
- "should not read: id range below lower bound"},
- {iceberg.LessThan(iceberg.Reference("some_nulls"),
"b"), true,
- "should read: lower bound in range"},
- {iceberg.LessThan(iceberg.Reference("float"), 15.50),
true,
- "should read: lower bound in range"},
- {iceberg.LessThan(iceberg.Reference("no_nan_or_null"),
15.50), true,
- "should read: lower bound in range"},
-
{iceberg.LessThanEqual(iceberg.Reference("no_nulls_same_value_a"), "a"), true,
- "should read: lower bound in range"},
- {iceberg.LessThan(iceberg.Reference("id"),
int32(IntMinValue-25)), false,
- "should not read: id range below lower bound (5
< 30)"},
- {iceberg.LessThan(iceberg.Reference("id"),
int32(IntMinValue)), false,
- "should not read: id range below lower bound
(30 is not < 30)"},
- {iceberg.LessThan(iceberg.Reference("id"),
int32(IntMinValue+1)), true,
- "should read: one possible id"},
- {iceberg.LessThan(iceberg.Reference("id"),
int32(IntMaxValue)), true,
- "should read: many possible ids"},
- {iceberg.LessThanEqual(iceberg.Reference("id"),
int32(IntMinValue-25)), false,
- "should not read: id range below lower bound (5
< 30)"},
- {iceberg.LessThanEqual(iceberg.Reference("id"),
int32(IntMinValue-1)), false,
- "should not read: id range below lower bound 29
< 30"},
- {iceberg.LessThanEqual(iceberg.Reference("id"),
int32(IntMinValue)), true,
- "should read: one possible id"},
- {iceberg.LessThanEqual(iceberg.Reference("id"),
int32(IntMaxValue)), true,
- "should read: many possible ids"},
- {iceberg.GreaterThan(iceberg.Reference("id"),
int32(IntMaxValue+6)), false,
- "should not read: id range above upper bound
(85 < 79)"},
- {iceberg.GreaterThan(iceberg.Reference("id"),
int32(IntMaxValue)), false,
- "should not read: id range above upper bound
(79 is not > 79)"},
- {iceberg.GreaterThan(iceberg.Reference("id"),
int32(IntMaxValue-1)), true,
- "should read: one possible id"},
- {iceberg.GreaterThan(iceberg.Reference("id"),
int32(IntMaxValue-4)), true,
- "should read: many possible ids"},
- {iceberg.GreaterThanEqual(iceberg.Reference("id"),
int32(IntMaxValue+6)), false,
- "should not read: id range is above upper bound
(85 < 79)"},
- {iceberg.GreaterThanEqual(iceberg.Reference("id"),
int32(IntMaxValue+1)), false,
- "should not read: id range above upper bound
(80 > 79)"},
- {iceberg.GreaterThanEqual(iceberg.Reference("id"),
int32(IntMaxValue)), true,
- "should read: one possible id"},
- {iceberg.GreaterThanEqual(iceberg.Reference("id"),
int32(IntMaxValue)), true,
- "should read: many possible ids"},
- {iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMinValue-25)), false,
- "should not read: id below lower bound"},
- {iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMinValue-1)), false,
- "should not read: id below lower bound"},
- {iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMinValue)), true,
- "should read: id equal to lower bound"},
- {iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMaxValue-4)), true,
- "should read: id between lower and upper
bounds"},
- {iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMaxValue)), true,
- "should read: id equal to upper bound"},
- {iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMaxValue+1)), false,
- "should not read: id above upper bound"},
- {iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMaxValue+6)), false,
- "should not read: id above upper bound"},
- {iceberg.NotEqualTo(iceberg.Reference("id"),
int32(IntMinValue-25)), true,
- "should read: id below lower bound"},
- {iceberg.NotEqualTo(iceberg.Reference("id"),
int32(IntMinValue-1)), true,
- "should read: id below lower bound"},
- {iceberg.NotEqualTo(iceberg.Reference("id"),
int32(IntMinValue)), true,
- "should read: id equal to lower bound"},
- {iceberg.NotEqualTo(iceberg.Reference("id"),
int32(IntMaxValue-4)), true,
- "should read: id between lower and upper
bounds"},
- {iceberg.NotEqualTo(iceberg.Reference("id"),
int32(IntMaxValue)), true,
- "should read: id equal to upper bound"},
- {iceberg.NotEqualTo(iceberg.Reference("id"),
int32(IntMaxValue+1)), true,
- "should read: id above upper bound"},
- {iceberg.NotEqualTo(iceberg.Reference("id"),
int32(IntMaxValue+6)), true,
- "should read: id above upper bound"},
-
{iceberg.NewNot(iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMinValue-25))), true,
- "should read: id below lower bound"},
-
{iceberg.NewNot(iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMinValue-1))), true,
- "should read: id below lower bound"},
-
{iceberg.NewNot(iceberg.EqualTo(iceberg.Reference("id"), int32(IntMinValue))),
true,
- "should read: id equal to lower bound"},
-
{iceberg.NewNot(iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMaxValue-4))), true,
- "should read: id between lower and upper
bounds"},
-
{iceberg.NewNot(iceberg.EqualTo(iceberg.Reference("id"), int32(IntMaxValue))),
true,
- "should read: id equal to upper bound"},
-
{iceberg.NewNot(iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMaxValue+1))), true,
- "should read: id above upper bound"},
-
{iceberg.NewNot(iceberg.EqualTo(iceberg.Reference("id"),
int32(IntMaxValue+6))), true,
- "should read: id above upper bound"},
- {iceberg.IsIn(iceberg.Reference("id"),
int32(IntMinValue-25), IntMinValue-24), false,
- "should not read: id below lower bound (5 < 30,
6 < 30)"},
- {iceberg.IsIn(iceberg.Reference("id"),
int32(IntMinValue-2), IntMinValue-1), false,
- "should not read: id below lower bound (28 <
30, 29 < 30)"},
- {iceberg.IsIn(iceberg.Reference("id"),
int32(IntMinValue-1), IntMinValue), true,
- "should read: id equal to lower bound (30 ==
30)"},
- {iceberg.IsIn(iceberg.Reference("id"),
int32(IntMaxValue-4), IntMaxValue-3), true,
- "should read: id between lower and upper bounds
(30 < 75 < 79, 30 < 76 < 79)"},
- {iceberg.IsIn(iceberg.Reference("id"),
int32(IntMaxValue), IntMaxValue+1), true,
- "should read: id equal to upper bound (79 ==
79)"},
- {iceberg.IsIn(iceberg.Reference("id"),
int32(IntMaxValue+1), IntMaxValue+2), false,
- "should not read: id above upper bound (80 >
79, 81 > 79)"},
- {iceberg.IsIn(iceberg.Reference("id"),
int32(IntMaxValue+6), IntMaxValue+7), false,
- "should not read: id above upper bound (85 >
79, 86 > 79)"},
-
{iceberg.IsIn(iceberg.Reference("all_nulls_missing_nan"), "abc", "def"), false,
- "should skip: in on all nulls column"},
- {iceberg.IsIn(iceberg.Reference("some_nulls"), "abc",
"def"), true,
- "should read: in on some nulls column"},
- {iceberg.IsIn(iceberg.Reference("no_nulls"), "abc",
"def"), true,
- "should read: in on no nulls column"},
-
{iceberg.IsIn(iceberg.Reference("no_nulls_same_value_a"), "a", "b"), true,
- "should read: in on no nulls column"},
- {iceberg.IsIn(iceberg.Reference("float"), 0, -5.5),
true,
- "should read: float equal to lower bound"},
- {iceberg.IsIn(iceberg.Reference("no_nan_or_null"), 0,
-5.5), true,
- "should read: float equal to lower bound"},
- {iceberg.NotIn(iceberg.Reference("id"),
int32(IntMinValue-25), IntMinValue-24), true,
- "should read: id below lower bound (5 < 30, 6 <
30)"},
- {iceberg.NotIn(iceberg.Reference("id"),
int32(IntMinValue-2), IntMinValue-1), true,
- "should read: id below lower bound (28 < 30, 29
< 30)"},
- {iceberg.NotIn(iceberg.Reference("id"),
int32(IntMinValue-1), IntMinValue), true,
- "should read: id equal to lower bound (30 ==
30)"},
- {iceberg.NotIn(iceberg.Reference("id"),
int32(IntMaxValue-4), IntMaxValue-3), true,
- "should read: id between lower and upper bounds
(30 < 75 < 79, 30 < 76 < 79)"},
- {iceberg.NotIn(iceberg.Reference("id"),
int32(IntMaxValue), IntMaxValue+1), true,
- "should read: id equal to upper bound (79 ==
79)"},
- {iceberg.NotIn(iceberg.Reference("id"),
int32(IntMaxValue+1), IntMaxValue+2), true,
- "should read: id above upper bound (80 > 79, 81
> 79)"},
- {iceberg.NotIn(iceberg.Reference("id"),
int32(IntMaxValue+6), IntMaxValue+7), true,
- "should read: id above upper bound (85 > 79, 86
> 79)"},
-
{iceberg.NotIn(iceberg.Reference("all_nulls_missing_nan"), "abc", "def"), true,
- "should read: notIn on all nulls column"},
- {iceberg.NotIn(iceberg.Reference("some_nulls"), "abc",
"def"), true,
- "should read: notIn on some nulls column"},
- {iceberg.NotIn(iceberg.Reference("no_nulls"), "abc",
"def"), true,
- "should read: notIn on no nulls column"},
- {iceberg.StartsWith(iceberg.Reference("some_nulls"),
"a"), true,
- "should read: range matches"},
- {iceberg.StartsWith(iceberg.Reference("some_nulls"),
"aa"), true,
- "should read: range matches"},
- {iceberg.StartsWith(iceberg.Reference("some_nulls"),
"dddd"), true,
- "should read: range matches"},
- {iceberg.StartsWith(iceberg.Reference("some_nulls"),
"z"), true,
- "should read: range matches"},
- {iceberg.StartsWith(iceberg.Reference("no_nulls"),
"a"), true,
- "should read: range matches"},
- {iceberg.StartsWith(iceberg.Reference("some_nulls"),
"zzzz"), false,
- "should skip: range doesn't match"},
- {iceberg.StartsWith(iceberg.Reference("some_nulls"),
"1"), false,
- "should skip: range doesn't match"},
-
{iceberg.StartsWith(iceberg.Reference("no_nulls_same_value_a"), "a"), true,
- "should read: all values start with the
prefix"},
- {iceberg.NotStartsWith(iceberg.Reference("some_nulls"),
"a"), true,
- "should read: range matches"},
- {iceberg.NotStartsWith(iceberg.Reference("some_nulls"),
"aa"), true,
- "should read: range matches"},
- {iceberg.NotStartsWith(iceberg.Reference("some_nulls"),
"dddd"), true,
- "should read: range matches"},
- {iceberg.NotStartsWith(iceberg.Reference("some_nulls"),
"z"), true,
- "should read: range matches"},
- {iceberg.NotStartsWith(iceberg.Reference("no_nulls"),
"a"), true,
- "should read: range matches"},
- {iceberg.NotStartsWith(iceberg.Reference("some_nulls"),
"zzzz"), true,
- "should read: range matches"},
- {iceberg.NotStartsWith(iceberg.Reference("some_nulls"),
"1"), true,
- "should read: range matches"},
-
{iceberg.NotStartsWith(iceberg.Reference("all_same_value_or_null"), "a"), true,
- "should read: range matches"},
-
{iceberg.NotStartsWith(iceberg.Reference("all_same_value_or_null"), "aa"), true,
- "should read: range matches"},
-
{iceberg.NotStartsWith(iceberg.Reference("all_same_value_or_null"), "A"), true,
- "should read: range matches"},
- // Iceberg does not implement SQL 3-way boolean logic,
so the choice of an
- // all null column matching is by definition in order
to surface more values
- // to the query engine to allow it to make its own
decision
-
{iceberg.NotStartsWith(iceberg.Reference("all_nulls_missing_nan"), "A"), true,
- "should read: range matches"},
-
{iceberg.NotStartsWith(iceberg.Reference("no_nulls_same_value_a"), "a"), false,
- "should not read: all values start with the
prefix"},
- }
-
- for _, tt := range tests {
- t.Run(tt.expr.String(), func(t *testing.T) {
- eval, err := iceberg.NewManifestEvaluator(spec,
testSchema,
- tt.expr, true)
- require.NoError(t, err)
-
- result, err := eval(manifest)
- require.NoError(t, err)
- assert.Equal(t, tt.expect, result, tt.msg)
- })
- }
- })
-}
-
func TestRewriteNot(t *testing.T) {
tests := []struct {
expr, expected iceberg.BooleanExpression