This is an automated email from the ASF dual-hosted git repository.
dweeks pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new b847800 Backporting case-sensitivity flag to expression module (#196)
b847800 is described below
commit b847800ff7df9737ea99ab3d54c04e4188bd5a0a
Author: TGooch44 <[email protected]>
AuthorDate: Fri May 31 12:27:18 2019 -0700
Backporting case-sensitivity flag to expression module (#196)
* [WIP] Initial python implementation commit
* Updating PR per comments from @xhochy
* Backporting case-insensitive expressions
---
python/iceberg/api/exceptions/__init__.py | 23 ++--
python/iceberg/api/exceptions/already_exists.py | 19 ---
.../iceberg/api/exceptions/validation_exception.py | 26 ----
python/iceberg/api/expressions/__init__.py | 8 ++
python/iceberg/api/expressions/binder.py | 13 +-
python/iceberg/api/expressions/evaluator.py | 4 +-
.../api/expressions/inclusive_metrics_evaluator.py | 15 +--
python/iceberg/api/expressions/predicate.py | 8 +-
python/iceberg/api/expressions/projections.py | 114 +++++++++++++++++
python/iceberg/api/expressions/reference.py | 4 +
.../api/expressions/strict_metrics_evaluator.py | 10 +-
python/iceberg/api/schema.py | 61 +++++----
python/iceberg/api/table_scan.py | 17 ++-
python/iceberg/api/types/check_compatibility.py | 0
python/iceberg/api/types/types.py | 35 ++++--
python/tests/api/expressions/__init__.py | 1 -
python/tests/api/expressions/conftest.py | 140 +++++++++++++++++++--
python/tests/api/expressions/test_evaluator.py | 18 +++
.../api/expressions/test_expression_binding.py | 12 ++
.../test_inclusive_metrics_evaluator.py | 12 ++
20 files changed, 415 insertions(+), 125 deletions(-)
diff --git a/python/iceberg/api/exceptions/__init__.py
b/python/iceberg/api/exceptions/__init__.py
index d0d7056..e6810ef 100644
--- a/python/iceberg/api/exceptions/__init__.py
+++ b/python/iceberg/api/exceptions/__init__.py
@@ -6,14 +6,21 @@
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
-# http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+__all__ = ["AlreadyExistsException",
+ "CommitFailedException",
+ "NoSuchTableException",
+ "ValidationException"]
-from .already_exists import AlreadyExists # noqa
-from .validation_exception import ValidationException # noqa
\ No newline at end of file
+from .exceptions import (AlreadyExistsException,
+ CommitFailedException,
+ NoSuchTableException,
+ ValidationException)
diff --git a/python/iceberg/api/exceptions/already_exists.py
b/python/iceberg/api/exceptions/already_exists.py
deleted file mode 100644
index e92de04..0000000
--- a/python/iceberg/api/exceptions/already_exists.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-class AlreadyExists(Exception):
- pass
diff --git a/python/iceberg/api/exceptions/validation_exception.py
b/python/iceberg/api/exceptions/validation_exception.py
deleted file mode 100644
index 7261e6b..0000000
--- a/python/iceberg/api/exceptions/validation_exception.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-class ValidationException(RuntimeError):
-
- @staticmethod
- def check(test, message, args):
- if not test:
- raise ValidationException(message, args)
-
- def __init__(self, message, args):
- super(ValidationException, self).__init__(message % args)
diff --git a/python/iceberg/api/expressions/__init__.py
b/python/iceberg/api/expressions/__init__.py
index 126bdd7..fdc5448 100644
--- a/python/iceberg/api/expressions/__init__.py
+++ b/python/iceberg/api/expressions/__init__.py
@@ -36,7 +36,9 @@ __all__ = ["ABOVE_MAX",
"FalseExp",
"FixedLiteral",
"FloatLiteral",
+ "inclusive",
"InclusiveMetricsEvaluator",
+ "InclusiveProjection",
"IntegerLiteral",
"JAVA_MAX_FLOAT",
"JAVA_MAX_INT",
@@ -50,7 +52,9 @@ __all__ = ["ABOVE_MAX",
"Or",
"Predicate",
"Reference",
+ "strict",
"StrictMetricsEvaluator",
+ "StrictProjection",
"StringLiteral",
"TRUE",
"TrueExp",
@@ -91,6 +95,10 @@ from .literals import (ABOVE_MAX,
from .predicate import (BoundPredicate,
Predicate,
UnboundPredicate)
+from .projections import (inclusive,
+ InclusiveProjection,
+ strict,
+ StrictProjection)
from .reference import (BoundReference,
NamedReference,
Reference)
diff --git a/python/iceberg/api/expressions/binder.py
b/python/iceberg/api/expressions/binder.py
index ad5921c..3be2d46 100644
--- a/python/iceberg/api/expressions/binder.py
+++ b/python/iceberg/api/expressions/binder.py
@@ -22,16 +22,16 @@ from .predicate import BoundPredicate
class Binder(object):
@staticmethod
- def bind(struct, expr):
- return ExpressionVisitors.visit(expr, Binder.BindVisitor(struct))
+ def bind(struct, expr, case_sensitive=True):
+ return ExpressionVisitors.visit(expr, Binder.BindVisitor(struct,
case_sensitive))
@staticmethod
- def bound_references(struct, exprs):
+ def bound_references(struct, exprs, case_sensitive=True):
if exprs is None:
return set()
visitor = Binder.ReferenceVisitor()
for expr in exprs:
- ExpressionVisitors.visit(Binder.bind(struct, expr), visitor)
+ ExpressionVisitors.visit(Binder.bind(struct, expr,
case_sensitive), visitor)
return visitor.references
@@ -40,8 +40,9 @@ class Binder(object):
class BindVisitor(ExpressionVisitors.ExpressionVisitor):
- def __init__(self, struct):
+ def __init__(self, struct, case_sensitive=True):
self.struct = struct
+ self.case_sensitive = case_sensitive
def always_true(self):
return Expressions.always_true()
@@ -62,7 +63,7 @@ class Binder(object):
if isinstance(pred, BoundPredicate):
raise RuntimeError("Found already bound predicate:
{}".format(pred))
- return pred.bind(self.struct)
+ return pred.bind(self.struct, self.case_sensitive)
class ReferenceVisitor(ExpressionVisitors.ExpressionVisitor):
diff --git a/python/iceberg/api/expressions/evaluator.py
b/python/iceberg/api/expressions/evaluator.py
index d4bf269..340f352 100644
--- a/python/iceberg/api/expressions/evaluator.py
+++ b/python/iceberg/api/expressions/evaluator.py
@@ -21,8 +21,8 @@ from .expressions import ExpressionVisitors
class Evaluator(object):
- def __init__(self, struct, unbound):
- self.expr = Binder.bind(struct, unbound)
+ def __init__(self, struct, unbound, case_sensitive=True):
+ self.expr = Binder.bind(struct, unbound, case_sensitive)
self.visitors = None
def visitor(self):
diff --git a/python/iceberg/api/expressions/inclusive_metrics_evaluator.py
b/python/iceberg/api/expressions/inclusive_metrics_evaluator.py
index 5b943e2..3134831 100644
--- a/python/iceberg/api/expressions/inclusive_metrics_evaluator.py
+++ b/python/iceberg/api/expressions/inclusive_metrics_evaluator.py
@@ -22,10 +22,11 @@ from ..types import Conversions
class InclusiveMetricsEvaluator(object):
- def __init__(self, schema, unbound):
+ def __init__(self, schema, unbound, case_sensitive=True):
self.schema = schema
self.struct = schema.as_struct()
- self.expr = Binder.bind(self.struct, Expressions.rewrite_not(unbound))
+ self.case_sensitive = case_sensitive
+ self.expr = Binder.bind(self.struct, Expressions.rewrite_not(unbound),
case_sensitive)
self._visitors = None
def _visitor(self):
@@ -52,13 +53,13 @@ class
MetricsEvalVisitor(ExpressionVisitors.BoundExpressionVisitor):
self.struct = struct
def eval(self, file):
- if file.record_count <= 0:
+ if file.record_count() <= 0:
return MetricsEvalVisitor.ROWS_CANNOT_MATCH
- self.value_counts = file.value_counts
- self.null_counts = file.null_value_counts
- self.lower_bounds = file.lower_bounds
- self.upper_bounds = file.upper_bounds
+ self.value_counts = file.value_counts()
+ self.null_counts = file.null_value_counts()
+ self.lower_bounds = file.lower_bounds()
+ self.upper_bounds = file.upper_bounds()
return ExpressionVisitors.visit(self.expr, self)
diff --git a/python/iceberg/api/expressions/predicate.py
b/python/iceberg/api/expressions/predicate.py
index 55625dc..4558dc9 100644
--- a/python/iceberg/api/expressions/predicate.py
+++ b/python/iceberg/api/expressions/predicate.py
@@ -93,8 +93,12 @@ class UnboundPredicate(Predicate):
def negate(self):
return UnboundPredicate(self.op.negate(), self.ref, self.lit)
- def bind(self, struct): # noqa: C901
- field = struct.field(self.ref.name)
+ def bind(self, struct, case_sensitive=True): # noqa: C901
+ if case_sensitive:
+ field = struct.field(self.ref.name)
+ else:
+ field = struct.case_insensitive_field(self.ref.name.lower())
+
ValidationException.check(field is not None,
"Cannot find field '%s' in struct %s",
(self.ref.name, struct))
diff --git a/python/iceberg/api/expressions/projections.py
b/python/iceberg/api/expressions/projections.py
new file mode 100644
index 0000000..1a4a6bd
--- /dev/null
+++ b/python/iceberg/api/expressions/projections.py
@@ -0,0 +1,114 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+from .expressions import Expressions, ExpressionVisitors, RewriteNot
+from .predicate import BoundPredicate, UnboundPredicate
+
+
+def inclusive(spec, case_sensitive=True):
+ return InclusiveProjection(spec, case_sensitive)
+
+
+def strict(spec):
+ return StrictProjection(spec)
+
+
+class ProjectionEvaluator(ExpressionVisitors.ExpressionVisitor):
+
+ def project(self, expr):
+ raise NotImplementedError()
+
+
+class BaseProjectionEvaluator(ProjectionEvaluator):
+
+ def __init__(self, spec, case_sensitive=True):
+ self.spec = spec
+ self.case_sensitive = case_sensitive
+
+ def project(self, expr):
+ # projections assume that there are no NOT nodes in the expression
tree. to ensure that this
+ # is the case, the expression is rewritten to push all NOT nodes down
to the expression
+ # leaf nodes.
+ # this is necessary to ensure that the default expression returned
when a predicate can't be
+ # projected is correct.
+ #
+ return ExpressionVisitors.visit(ExpressionVisitors.visit(expr,
RewriteNot.get()), self)
+
+ def always_true(self):
+ return Expressions.always_true()
+
+ def always_false(self):
+ return Expressions.always_false()
+
+ def not_(self, result):
+ raise RuntimeError("[BUG] project called on expression with a not")
+
+ def and_(self, left_result, right_result):
+ return Expressions.and_(left_result, right_result)
+
+ def or_(self, left_result, right_result):
+ return Expressions.or_(left_result, right_result)
+
+ def predicate(self, pred):
+ bound = pred.bind(self.spec.schema.as_struct(),
case_sensitive=self.case_sensitive)
+
+ if isinstance(bound, BoundPredicate):
+ return self.predicate(bound)
+
+ return bound
+
+
+class InclusiveProjection(BaseProjectionEvaluator):
+
+ def __init__(self, spec, case_sensitive=True):
+ super(InclusiveProjection, self).__init__(spec,
+
case_sensitive=case_sensitive)
+
+ def predicate(self, pred):
+ if isinstance(pred, UnboundPredicate):
+ return super(InclusiveProjection, self).predicate(pred)
+
+ part = self.spec.get_field_by_source_id(pred.ref.field_id)
+
+ if part is None:
+ return self.always_true()
+
+ result = part.transform.project(part.name, pred)
+ if result is not None:
+ return result
+
+ return self.always_true()
+
+
+class StrictProjection(BaseProjectionEvaluator):
+
+ def __init__(self, spec):
+ super(StrictProjection, self).__init__(spec)
+
+ def predicate(self, pred):
+ part = self.spec.get_field_by_source_id(pred.ref.field_id)
+
+ if part is None:
+ return self.always_false()
+
+ result = part.transform.project_strict(part.name, pred)
+
+ if result is not None:
+ return result
+
+ return self.always_false()
diff --git a/python/iceberg/api/expressions/reference.py
b/python/iceberg/api/expressions/reference.py
index 1ceeb3a..d634e34 100644
--- a/python/iceberg/api/expressions/reference.py
+++ b/python/iceberg/api/expressions/reference.py
@@ -29,6 +29,10 @@ class BoundReference(Reference):
self.pos = self.find(field_id, struct)
self._type = struct.fields[self.pos].type
+ @property
+ def type(self):
+ return self._type
+
def __eq__(self, other):
if id(self) == id(other):
return True
diff --git a/python/iceberg/api/expressions/strict_metrics_evaluator.py
b/python/iceberg/api/expressions/strict_metrics_evaluator.py
index 1997bdd..58b7230 100644
--- a/python/iceberg/api/expressions/strict_metrics_evaluator.py
+++ b/python/iceberg/api/expressions/strict_metrics_evaluator.py
@@ -51,13 +51,13 @@ class StrictMetricsEvaluator(object):
self.upper_bounds = None
def eval(self, file):
- if file.record_count <= 0:
+ if file.record_count() <= 0:
return
StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MUST_MATCH
- self.value_counts = file.value_counts
- self.null_counts = file.null_value_counts
- self.lower_bounds = file.lower_bounds
- self.upper_bounds = file.upper_bounds
+ self.value_counts = file.value_counts()
+ self.null_counts = file.null_value_counts()
+ self.lower_bounds = file.lower_bounds()
+ self.upper_bounds = file.upper_bounds()
return ExpressionVisitors.visit(self.expr, self)
diff --git a/python/iceberg/api/schema.py b/python/iceberg/api/schema.py
index 19a154d..76b5339 100644
--- a/python/iceberg/api/schema.py
+++ b/python/iceberg/api/schema.py
@@ -28,7 +28,7 @@ class Schema(object):
ALL_COLUMNS = "*"
def __init__(self, *argv):
- aliases = dict()
+ aliases = None
if len(argv) == 1 and isinstance(argv[0], (list, tuple)):
columns = argv[0]
elif len(argv) == 2 and isinstance(argv[0], list) and
isinstance(argv[1], dict):
@@ -46,6 +46,7 @@ class Schema(object):
self._id_to_field = None
self._name_to_id = None
+ self._lowercase_name_to_id = None
self._id_to_name = None
def as_struct(self):
@@ -66,25 +67,33 @@ class Schema(object):
if self._name_to_id is None:
self._name_to_id = index_by_name(self.struct)
self._id_to_name = {v: k for k, v in self._name_to_id.items()}
+ self._lowercase_name_to_id = {k.lower(): v for k, v in
self._name_to_id.items()}
+
return self._name_to_id
+ def lazy_lowercase_name_to_id(self):
+ from .types import index_by_name
+ if self._lowercase_name_to_id is None:
+ self._name_to_id = index_by_name(self.struct)
+ self._id_to_name = {v: k for k, v in self._name_to_id.items()}
+ self._lowercase_name_to_id = {k.lower(): v for k, v in
self._name_to_id.items()}
+
+ return self._lowercase_name_to_id
+
def columns(self):
return self.struct.fields
def find_type(self, name):
- if not name:
- raise RuntimeError("Invalid Column Name (empty)")
-
if isinstance(name, int):
field = self.lazy_id_to_field().get(name)
if field:
return field.type
-
- id = self.lazy_name_to_id().get(name)
- if id:
- return self.find_type(id)
-
- raise RuntimeError("Invalid Column (could not find): %s" % name)
+ elif isinstance(name, str):
+ id = self.lazy_name_to_id().get(name)
+ if id:
+ return self.find_type(id)
+ else:
+ raise RuntimeError("Invalid Column (could not find): %s" % name)
def find_field(self, id):
if isinstance(id, int):
@@ -109,25 +118,29 @@ class Schema(object):
if self._id_to_alias:
return self._id_to_alias.get(field_id)
- def select(self, *argv):
+ def select(self, cols):
+ return self._internal_select(True, cols)
+
+ def case_insensitive_select(self, cols):
+ return self._internal_select(False, cols)
+
+ def _internal_select(self, case_sensitive, cols):
from .types import select
- if not(len(argv) == 1 and isinstance(argv[0], list)):
- return self.select(argv)
- if len(argv) == 1:
- names = argv[0]
- if Schema.ALL_COLUMNS in names:
- return self
+ if Schema.ALL_COLUMNS in cols:
+ return self
+
+ selected = set()
+ for name in cols:
+ if case_sensitive:
+ field_id = self.lazy_name_to_id().get(name)
else:
- selected = list()
- for name in names:
- id = self.lazy_name_to_id().get(name)
- if id:
- selected.append(id)
+ field_id = self.lazy_lowercase_name_to_id().get(name.lower())
- return select(self, selected) # noqa
+ if field_id is not None:
+ selected.add(field_id)
- raise RuntimeError("Illegal argument for select %s", argv)
+ return select(self, selected)
def __repr__(self):
return "Schema(%s)" % self.struct.fields
diff --git a/python/iceberg/api/table_scan.py b/python/iceberg/api/table_scan.py
index e457b9a..aa67943 100644
--- a/python/iceberg/api/table_scan.py
+++ b/python/iceberg/api/table_scan.py
@@ -21,20 +21,33 @@ class TableScan(object):
def __init__(self):
raise NotImplementedError()
+ @property
+ def row_filter(self):
+ raise NotImplementedError()
+
def use_snapshot(self, snapshot_id):
raise NotImplementedError()
def as_of_time(self, timestamp_millis):
raise NotImplementedError()
+ def project(self, schema):
+ raise NotImplementedError()
+
def select(self, columns):
raise NotImplementedError()
- def filter(self, expr=None):
+ def filter(self, expr):
raise NotImplementedError()
- def plan_file(self):
+ def plan_files(self):
raise NotImplementedError()
def plan_tasks(self):
raise NotImplementedError()
+
+ def is_case_sensitive(self):
+ raise NotImplementedError()
+
+ def options(self):
+ raise NotImplementedError()
diff --git a/python/iceberg/api/types/check_compatibility.py
b/python/iceberg/api/types/check_compatibility.py
deleted file mode 100644
index e69de29..0000000
diff --git a/python/iceberg/api/types/types.py
b/python/iceberg/api/types/types.py
index 40e1d88..225bdee 100644
--- a/python/iceberg/api/types/types.py
+++ b/python/iceberg/api/types/types.py
@@ -337,7 +337,7 @@ class FixedType(PrimitiveType):
return not self.__eq__(other)
def __repr__(self):
- return "fixed[%s]" % (self.length())
+ return "fixed[%s]" % (self.length)
def __str__(self):
return self.__repr__()
@@ -475,6 +475,7 @@ class StructType(NestedType):
self._fieldList = None
self._fieldsByName = None
+ self._fieldsByLowercaseName = None
self._fieldsById = None
def __eq__(self, other):
@@ -490,16 +491,19 @@ class StructType(NestedType):
@property
def fields(self):
- return self._lazyFieldList()
+ return self._lazy_field_list()
def field(self, name=None, id=None):
if name:
- return self._lazyFieldsByName().get(name)
+ return self._lazy_fields_by_name().get(name)
elif id:
- return self._lazyFieldsById()[id]
+ return self._lazy_fields_by_id()[id]
raise RuntimeError("No valid field info passed in ")
+ def case_insensitive_field(self, name):
+ return self._lazy_fields_by_lowercase_name().get(name)
+
@property
def type_id(self):
return TypeID.STRUCT
@@ -519,27 +523,34 @@ class StructType(NestedType):
def __key(self):
return StructType.__class__, self.fields
- def _lazyFieldList(self):
+ def _lazy_field_list(self):
if self._fieldList is None:
self._fieldList = tuple(self._fields)
return self._fieldList
- def _lazyFieldsByName(self):
+ def _lazy_fields_by_name(self):
+ if self._fieldsByName is None:
+ self.index_fields()
+ return self._fieldsByName
+
+ def _lazy_fields_by_lowercase_name(self):
if self._fieldsByName is None:
- self.indexFields()
+ self.index_fields()
return self._fieldsByName
- def _lazyFieldsById(self):
+ def _lazy_fields_by_id(self):
if self._fieldsById is None:
- self.indexFields()
+ self.index_fields()
return self._fieldsById
- def indexFields(self):
+ def index_fields(self):
self._fieldsByName = dict()
+ self._fieldsByLowercaseName = dict()
self._fieldsById = dict()
for field in self.fields:
self._fieldsByName[field.name] = field
+ self._fieldsByLowercaseName[field.name.lower()] = field
self._fieldsById[field.id] = field
@@ -668,7 +679,7 @@ class MapType(NestedType):
return self.value_field
def fields(self):
- return self._lazyFieldList()
+ return self._lazy_field_list()
def key_id(self):
return self.key_field.field_id
@@ -702,5 +713,5 @@ class MapType(NestedType):
def __key(self):
return MapType.__class__, self.key_field, self.value_field
- def _lazyFieldList(self):
+ def _lazy_field_list(self):
return tuple(self.key_field, self.value_field)
diff --git a/python/tests/api/expressions/__init__.py
b/python/tests/api/expressions/__init__.py
index 2456923..13a8339 100644
--- a/python/tests/api/expressions/__init__.py
+++ b/python/tests/api/expressions/__init__.py
@@ -14,4 +14,3 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-
diff --git a/python/tests/api/expressions/conftest.py
b/python/tests/api/expressions/conftest.py
index 54039b2..fe1d982 100644
--- a/python/tests/api/expressions/conftest.py
+++ b/python/tests/api/expressions/conftest.py
@@ -18,9 +18,10 @@
from decimal import Decimal
import io
import pickle
+import time
import uuid
-from iceberg.api import DataFile
+from iceberg.api import DataFile, ManifestFile, PartitionFieldSummary,
PartitionSpec
from iceberg.api.expressions import (BoundPredicate,
Expressions,
ExpressionVisitors,
@@ -96,17 +97,83 @@ class TestDataFile(DataFile):
def __init__(self, path, partition, record_count, value_counts=None,
null_value_counts=None,
lower_bounds=None, upper_bounds=None):
+ self._path = path
+ self._partition = partition
+ self._record_count = record_count
+ self._value_counts = value_counts
+ self._null_value_counts = null_value_counts
+ self._lower_bounds = lower_bounds
+ self._upper_bounds = upper_bounds
+ self._file_size_in_bytes = 0
+ self._block_size_in_bytes = 0
+ self._file_ordinal = None
+ self._column_sizes = None
+
+ def path(self):
+ return self._path
+
+ def partition(self):
+ return self._partition
+
+ def record_count(self):
+ return self._record_count
+
+ def value_counts(self):
+ return self._value_counts
+
+ def null_value_counts(self):
+ return self._null_value_counts
+
+ def lower_bounds(self):
+ return self._lower_bounds
+
+ def upper_bounds(self):
+ return self._upper_bounds
+
+ def file_size_in_bytes(self):
+ return self._file_size_in_bytes
+
+ def file_ordinal(self):
+ return self._file_ordinal
+
+ def column_sizes(self):
+ return self._column_sizes
+
+ def copy(self):
+ return self
+
+
+class TestManifestFile(ManifestFile):
+
+ def __init__(self, path, length, spec_id, snapshot_id, added_files,
existing_files, deleted_files, partitions):
self.path = path
- self.partition = partition
- self.record_count = record_count
- self.value_counts = value_counts
- self.null_value_counts = null_value_counts
- self.lower_bounds = lower_bounds
- self.upper_bounds = upper_bounds
- self.file_size_in_bytes = 0
- self.block_size_in_bytes = 0
- self.file_ordinal = None
- self.column_size = None
+ self.length = length
+ self.spec_id = spec_id
+ self.snapshot_id = snapshot_id
+ self.added_files = added_files
+ self.existing_files = existing_files
+ self.deleted_files = deleted_files
+ self.partitions = partitions
+
+ def copy(self):
+ return self
+
+
+class TestFieldSummary(PartitionFieldSummary):
+
+ def __init__(self, contains_null, lower_bound, upper_bound):
+ self._contains_null = contains_null
+ self._lower_bound = lower_bound
+ self._upper_bound = upper_bound
+
+ def contains_null(self):
+ return self._contains_null
+
+ def lower_bound(self):
+ return self._lower_bound
+
+ def upper_bound(self):
+ return self._upper_bound
def copy(self):
return self
@@ -273,6 +340,18 @@ def not_eq_rewrite(request):
@pytest.fixture(scope="session",
+ params=[Expressions.equal("ID", 5),
+ Expressions.equal("ID", 29),
+ Expressions.equal("ID", 30),
+ Expressions.equal("ID", 75),
+ Expressions.equal("ID", 79),
+ Expressions.equal("ID", 80),
+ Expressions.equal("ID", 85)])
+def not_eq_uc(request):
+ yield request.param
+
+
+@pytest.fixture(scope="session",
params=[Literal.of(False),
Literal.of(34),
Literal.of(35),
@@ -305,3 +384,42 @@ def type_val_tuples(request):
(DecimalType.of(9, 4), "34.5600")])
def float_type_val_tuples(request):
yield request.param
+
+
+@pytest.fixture(scope="session")
+def inc_man_spec():
+ inc_schema = Schema(NestedField.required(1, "id", IntegerType.get()),
+ NestedField.optional(4, "all_nulls", StringType.get()),
+ NestedField.optional(5, "some_nulls",
StringType.get()),
+ NestedField.optional(6, "no_nulls", StringType.get()))
+ return (PartitionSpec.builder_for(inc_schema)
+ .with_spec_id(0)
+ .identity("id")
+ .identity("all_nulls")
+ .identity("some_nulls")
+ .identity("no_nulls")
+ .build()
+ )
+
+
+@pytest.fixture(scope="session")
+def inc_man_file():
+ return TestManifestFile("manifest-list.avro", 1024, 0, int(time.time() *
1000), 5, 10, 0,
+ (TestFieldSummary(False,
+
Conversions.to_byte_buffer(IntegerType.get(), 30),
+
Conversions.to_byte_buffer(IntegerType.get(), 79)),
+ TestFieldSummary(True,
+ None,
+ None),
+ TestFieldSummary(True,
+
Conversions.to_byte_buffer(StringType.get(), 'a'),
+
Conversions.to_byte_buffer(StringType.get(), 'z')),
+ TestFieldSummary(False,
+
Conversions.to_byte_buffer(StringType.get(), 'a'),
+
Conversions.to_byte_buffer(StringType.get(), 'z'))
+ ))
+
+
+@pytest.fixture(scope="session")
+def inc_man_file_ns():
+ return TestManifestFile("manifest-list.avro", 1024, 0, int(time.time() *
1000), None, None, None, None)
diff --git a/python/tests/api/expressions/test_evaluator.py
b/python/tests/api/expressions/test_evaluator.py
index ec3d08f..b5820f4 100644
--- a/python/tests/api/expressions/test_evaluator.py
+++ b/python/tests/api/expressions/test_evaluator.py
@@ -15,11 +15,14 @@
# specific language governing permissions and limitations
# under the License.
+
import iceberg.api.expressions as exp
from iceberg.api.types import (IntegerType,
NestedField,
StringType,
StructType)
+from iceberg.exceptions import ValidationException
+from pytest import raises
STRUCT = StructType.of([NestedField.required(13, "x", IntegerType.get()),
NestedField.required(14, "y", IntegerType.get()),
@@ -123,6 +126,21 @@ def test_not(row_of):
assert evaluator.eval(row_of((8,)))
+def test_case_insensitive_not(row_of):
+ evaluator = exp.evaluator.Evaluator(STRUCT,
+
exp.expressions.Expressions.not_(exp.expressions.Expressions.equal("X", 7)),
+ case_sensitive=False)
+ assert not evaluator.eval(row_of((7,)))
+ assert evaluator.eval(row_of((8,)))
+
+
+def test_case_sensitive_not():
+ with raises(ValidationException):
+ exp.evaluator.Evaluator(STRUCT,
+
exp.expressions.Expressions.not_(exp.expressions.Expressions.equal("X", 7)),
+ case_sensitive=True)
+
+
def test_char_seq_value(row_of):
struct = StructType.of([NestedField.required(34, "s", StringType.get())])
evaluator = exp.evaluator.Evaluator(struct,
exp.expressions.Expressions.equal("s", "abc"))
diff --git a/python/tests/api/expressions/test_expression_binding.py
b/python/tests/api/expressions/test_expression_binding.py
index 47f6fea..f6a4d98 100644
--- a/python/tests/api/expressions/test_expression_binding.py
+++ b/python/tests/api/expressions/test_expression_binding.py
@@ -53,6 +53,18 @@ def test_single_reference(assert_all_bound):
Binder.bind(STRUCT, expr))
+def test_case_insensitive_reference(assert_all_bound):
+ expr = Expressions.not_(Expressions.equal("X", 7))
+ assert_all_bound("Single reference",
+ Binder.bind(STRUCT, expr, case_sensitive=False))
+
+
+def test_case_sensitive_reference():
+ with raises(ice_ex.ValidationException):
+ expr = Expressions.not_(Expressions.equal("X", 7))
+ Binder.bind(STRUCT, expr, case_sensitive=True)
+
+
def test_multiple_references(assert_all_bound):
expr = Expressions.or_(Expressions.and_(Expressions.equal("x", 7),
Expressions.less_than("y", 100)),
diff --git a/python/tests/api/expressions/test_inclusive_metrics_evaluator.py
b/python/tests/api/expressions/test_inclusive_metrics_evaluator.py
index fcbef1e..e9f0473 100644
--- a/python/tests/api/expressions/test_inclusive_metrics_evaluator.py
+++ b/python/tests/api/expressions/test_inclusive_metrics_evaluator.py
@@ -17,6 +17,7 @@
from iceberg.api.expressions import (Expressions,
InclusiveMetricsEvaluator)
+from iceberg.exceptions import ValidationException
from pytest import raises
@@ -116,3 +117,14 @@ def test_integer_not_eq(schema, file, not_eq):
def test_not_eq_rewritten(schema, file, not_eq_rewrite):
assert InclusiveMetricsEvaluator(schema,
Expressions.not_(not_eq_rewrite)).eval(file)
+
+
+def test_case_insensitive_int_not_eq_rewritten(schema, file, not_eq_uc):
+ assert InclusiveMetricsEvaluator(schema, Expressions.not_(not_eq_uc),
+ case_sensitive=False).eval(file)
+
+
+def test_case_sensitive_int_not_eq_rewritten(schema, file, not_eq_uc):
+ with raises(ValidationException):
+ assert InclusiveMetricsEvaluator(schema, Expressions.not_(not_eq_uc),
+ case_sensitive=True).eval(file)