[incubator-iceberg] branch master updated: Backporting case-sensitivity flag to expression module (#196)

dweeks Fri, 31 May 2019 12:27:33 -0700

This is an automated email from the ASF dual-hosted git repository.

dweeks pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-iceberg.git



The following commit(s) were added to refs/heads/master by this push:
     new b847800  Backporting case-sensitivity flag to expression module (#196)
b847800 is described below

commit b847800ff7df9737ea99ab3d54c04e4188bd5a0a
Author: TGooch44 <[email protected]>
AuthorDate: Fri May 31 12:27:18 2019 -0700

    Backporting case-sensitivity flag to expression module (#196)
    
    * [WIP] Initial python implementation commit
    
    * Updating PR per comments from @xhochy
    
    * Backporting case-insensitive expressions
---
 python/iceberg/api/exceptions/__init__.py          |  23 ++--
 python/iceberg/api/exceptions/already_exists.py    |  19 ---
 .../iceberg/api/exceptions/validation_exception.py |  26 ----
 python/iceberg/api/expressions/__init__.py         |   8 ++
 python/iceberg/api/expressions/binder.py           |  13 +-
 python/iceberg/api/expressions/evaluator.py        |   4 +-
 .../api/expressions/inclusive_metrics_evaluator.py |  15 +--
 python/iceberg/api/expressions/predicate.py        |   8 +-
 python/iceberg/api/expressions/projections.py      | 114 +++++++++++++++++
 python/iceberg/api/expressions/reference.py        |   4 +
 .../api/expressions/strict_metrics_evaluator.py    |  10 +-
 python/iceberg/api/schema.py                       |  61 +++++----
 python/iceberg/api/table_scan.py                   |  17 ++-
 python/iceberg/api/types/check_compatibility.py    |   0
 python/iceberg/api/types/types.py                  |  35 ++++--
 python/tests/api/expressions/__init__.py           |   1 -
 python/tests/api/expressions/conftest.py           | 140 +++++++++++++++++++--
 python/tests/api/expressions/test_evaluator.py     |  18 +++
 .../api/expressions/test_expression_binding.py     |  12 ++
 .../test_inclusive_metrics_evaluator.py            |  12 ++
 20 files changed, 415 insertions(+), 125 deletions(-)

diff --git a/python/iceberg/api/exceptions/__init__.py b/python/iceberg/api/exceptions/__init__.py
index d0d7056..e6810ef 100644
--- a/python/iceberg/api/exceptions/__init__.py
+++ b/python/iceberg/api/exceptions/__init__.py
@@ -6,14 +6,21 @@
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
-# http://www.apache.org/licenses/LICENSE-2.0
+#   http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
 
+__all__ = ["AlreadyExistsException",
+           "CommitFailedException",
+           "NoSuchTableException",
+           "ValidationException"]
 
-from .already_exists import AlreadyExists # noqa
-from .validation_exception import ValidationException # noqa
\ No newline at end of file
+from .exceptions import (AlreadyExistsException,
+                         CommitFailedException,
+                         NoSuchTableException,
+                         ValidationException)
diff --git a/python/iceberg/api/exceptions/already_exists.py b/python/iceberg/api/exceptions/already_exists.py
deleted file mode 100644
index e92de04..0000000
--- a/python/iceberg/api/exceptions/already_exists.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-class AlreadyExists(Exception):
-    pass
diff --git a/python/iceberg/api/exceptions/validation_exception.py b/python/iceberg/api/exceptions/validation_exception.py
deleted file mode 100644
index 7261e6b..0000000
--- a/python/iceberg/api/exceptions/validation_exception.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-class ValidationException(RuntimeError):
-
-    @staticmethod
-    def check(test, message, args):
-        if not test:
-            raise ValidationException(message, args)
-
-    def __init__(self, message, args):
-        super(ValidationException, self).__init__(message % args)
diff --git a/python/iceberg/api/expressions/__init__.py b/python/iceberg/api/expressions/__init__.py
index 126bdd7..fdc5448 100644
--- a/python/iceberg/api/expressions/__init__.py
+++ b/python/iceberg/api/expressions/__init__.py
@@ -36,7 +36,9 @@ __all__ = ["ABOVE_MAX",
            "FalseExp",
            "FixedLiteral",
            "FloatLiteral",
+           "inclusive",
            "InclusiveMetricsEvaluator",
+           "InclusiveProjection",
            "IntegerLiteral",
            "JAVA_MAX_FLOAT",
            "JAVA_MAX_INT",
@@ -50,7 +52,9 @@ __all__ = ["ABOVE_MAX",
            "Or",
            "Predicate",
            "Reference",
+           "strict",
            "StrictMetricsEvaluator",
+           "StrictProjection",
            "StringLiteral",
            "TRUE",
            "TrueExp",
@@ -91,6 +95,10 @@ from .literals import (ABOVE_MAX,
 from .predicate import (BoundPredicate,
                         Predicate,
                         UnboundPredicate)
+from .projections import (inclusive,
+                          InclusiveProjection,
+                          strict,
+                          StrictProjection)
 from .reference import (BoundReference,
                         NamedReference,
                         Reference)
diff --git a/python/iceberg/api/expressions/binder.py b/python/iceberg/api/expressions/binder.py
index ad5921c..3be2d46 100644
--- a/python/iceberg/api/expressions/binder.py
+++ b/python/iceberg/api/expressions/binder.py
@@ -22,16 +22,16 @@ from .predicate import BoundPredicate
 class Binder(object):
 
     @staticmethod
-    def bind(struct, expr):
-        return ExpressionVisitors.visit(expr, Binder.BindVisitor(struct))
+    def bind(struct, expr, case_sensitive=True):
+        return ExpressionVisitors.visit(expr, Binder.BindVisitor(struct, case_sensitive))
 
     @staticmethod
-    def bound_references(struct, exprs):
+    def bound_references(struct, exprs, case_sensitive=True):
         if exprs is None:
             return set()
         visitor = Binder.ReferenceVisitor()
         for expr in exprs:
-            ExpressionVisitors.visit(Binder.bind(struct, expr), visitor)
+            ExpressionVisitors.visit(Binder.bind(struct, expr, case_sensitive), visitor)
 
         return visitor.references
 
@@ -40,8 +40,9 @@ class Binder(object):
 
     class BindVisitor(ExpressionVisitors.ExpressionVisitor):
 
-        def __init__(self, struct):
+        def __init__(self, struct, case_sensitive=True):
             self.struct = struct
+            self.case_sensitive = case_sensitive
 
         def always_true(self):
             return Expressions.always_true()
@@ -62,7 +63,7 @@ class Binder(object):
             if isinstance(pred, BoundPredicate):
                 raise RuntimeError("Found already bound predicate: {}".format(pred))
 
-            return pred.bind(self.struct)
+            return pred.bind(self.struct, self.case_sensitive)
 
     class ReferenceVisitor(ExpressionVisitors.ExpressionVisitor):
 
diff --git a/python/iceberg/api/expressions/evaluator.py b/python/iceberg/api/expressions/evaluator.py
index d4bf269..340f352 100644
--- a/python/iceberg/api/expressions/evaluator.py
+++ b/python/iceberg/api/expressions/evaluator.py
@@ -21,8 +21,8 @@ from .expressions import ExpressionVisitors
 
 class Evaluator(object):
 
-    def __init__(self, struct, unbound):
-        self.expr = Binder.bind(struct, unbound)
+    def __init__(self, struct, unbound, case_sensitive=True):
+        self.expr = Binder.bind(struct, unbound, case_sensitive)
         self.visitors = None
 
     def visitor(self):
diff --git a/python/iceberg/api/expressions/inclusive_metrics_evaluator.py b/python/iceberg/api/expressions/inclusive_metrics_evaluator.py
index 5b943e2..3134831 100644
--- a/python/iceberg/api/expressions/inclusive_metrics_evaluator.py
+++ b/python/iceberg/api/expressions/inclusive_metrics_evaluator.py
@@ -22,10 +22,11 @@ from ..types import Conversions
 
 class InclusiveMetricsEvaluator(object):
 
-    def __init__(self, schema, unbound):
+    def __init__(self, schema, unbound, case_sensitive=True):
         self.schema = schema
         self.struct = schema.as_struct()
-        self.expr = Binder.bind(self.struct, Expressions.rewrite_not(unbound))
+        self.case_sensitive = case_sensitive
+        self.expr = Binder.bind(self.struct, Expressions.rewrite_not(unbound), case_sensitive)
         self._visitors = None
 
     def _visitor(self):
@@ -52,13 +53,13 @@ class MetricsEvalVisitor(ExpressionVisitors.BoundExpressionVisitor):
         self.struct = struct
 
     def eval(self, file):
-        if file.record_count <= 0:
+        if file.record_count() <= 0:
             return MetricsEvalVisitor.ROWS_CANNOT_MATCH
 
-        self.value_counts = file.value_counts
-        self.null_counts = file.null_value_counts
-        self.lower_bounds = file.lower_bounds
-        self.upper_bounds = file.upper_bounds
+        self.value_counts = file.value_counts()
+        self.null_counts = file.null_value_counts()
+        self.lower_bounds = file.lower_bounds()
+        self.upper_bounds = file.upper_bounds()
 
         return ExpressionVisitors.visit(self.expr, self)
 
diff --git a/python/iceberg/api/expressions/predicate.py b/python/iceberg/api/expressions/predicate.py
index 55625dc..4558dc9 100644
--- a/python/iceberg/api/expressions/predicate.py
+++ b/python/iceberg/api/expressions/predicate.py
@@ -93,8 +93,12 @@ class UnboundPredicate(Predicate):
     def negate(self):
         return UnboundPredicate(self.op.negate(), self.ref, self.lit)
 
-    def bind(self, struct):  # noqa: C901
-        field = struct.field(self.ref.name)
+    def bind(self, struct, case_sensitive=True):  # noqa: C901
+        if case_sensitive:
+            field = struct.field(self.ref.name)
+        else:
+            field = struct.case_insensitive_field(self.ref.name.lower())
+
         ValidationException.check(field is not None,
                                   "Cannot find field '%s' in struct %s", (self.ref.name, struct))
 
diff --git a/python/iceberg/api/expressions/projections.py b/python/iceberg/api/expressions/projections.py
new file mode 100644
index 0000000..1a4a6bd
--- /dev/null
+++ b/python/iceberg/api/expressions/projections.py
@@ -0,0 +1,114 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+from .expressions import Expressions, ExpressionVisitors, RewriteNot
+from .predicate import BoundPredicate, UnboundPredicate
+
+
+def inclusive(spec, case_sensitive=True):
+    return InclusiveProjection(spec, case_sensitive)
+
+
+def strict(spec):
+    return StrictProjection(spec)
+
+
+class ProjectionEvaluator(ExpressionVisitors.ExpressionVisitor):
+
+    def project(self, expr):
+        raise NotImplementedError()
+
+
+class BaseProjectionEvaluator(ProjectionEvaluator):
+
+    def __init__(self, spec, case_sensitive=True):
+        self.spec = spec
+        self.case_sensitive = case_sensitive
+
+    def project(self, expr):
+        # projections assume that there are no NOT nodes in the expression tree. to ensure that this
+        # is the case, the expression is rewritten to push all NOT nodes down to the expression
+        # leaf nodes.
+        # this is necessary to ensure that the default expression returned when a predicate can't be
+        # projected is correct.
+        #
+        return ExpressionVisitors.visit(ExpressionVisitors.visit(expr, RewriteNot.get()), self)
+
+    def always_true(self):
+        return Expressions.always_true()
+
+    def always_false(self):
+        return Expressions.always_false()
+
+    def not_(self, result):
+        raise RuntimeError("[BUG] project called on expression with a not")
+
+    def and_(self, left_result, right_result):
+        return Expressions.and_(left_result, right_result)
+
+    def or_(self, left_result, right_result):
+        return Expressions.or_(left_result, right_result)
+
+    def predicate(self, pred):
+        bound = pred.bind(self.spec.schema.as_struct(), case_sensitive=self.case_sensitive)
+
+        if isinstance(bound, BoundPredicate):
+            return self.predicate(bound)
+
+        return bound
+
+
+class InclusiveProjection(BaseProjectionEvaluator):
+
+    def __init__(self, spec, case_sensitive=True):
+        super(InclusiveProjection, self).__init__(spec,
+                                                  case_sensitive=case_sensitive)
+
+    def predicate(self, pred):
+        if isinstance(pred, UnboundPredicate):
+            return super(InclusiveProjection, self).predicate(pred)
+
+        part = self.spec.get_field_by_source_id(pred.ref.field_id)
+
+        if part is None:
+            return self.always_true()
+
+        result = part.transform.project(part.name, pred)
+        if result is not None:
+            return result
+
+        return self.always_true()
+
+
+class StrictProjection(BaseProjectionEvaluator):
+
+    def __init__(self, spec):
+        super(StrictProjection, self).__init__(spec)
+
+    def predicate(self, pred):
+        part = self.spec.get_field_by_source_id(pred.ref.field_id)
+
+        if part is None:
+            return self.always_false()
+
+        result = part.transform.project_strict(part.name, pred)
+
+        if result is not None:
+            return result
+
+        return self.always_false()
diff --git a/python/iceberg/api/expressions/reference.py b/python/iceberg/api/expressions/reference.py
index 1ceeb3a..d634e34 100644
--- a/python/iceberg/api/expressions/reference.py
+++ b/python/iceberg/api/expressions/reference.py
@@ -29,6 +29,10 @@ class BoundReference(Reference):
         self.pos = self.find(field_id, struct)
         self._type = struct.fields[self.pos].type
 
+    @property
+    def type(self):
+        return self._type
+
     def __eq__(self, other):
         if id(self) == id(other):
             return True
diff --git a/python/iceberg/api/expressions/strict_metrics_evaluator.py b/python/iceberg/api/expressions/strict_metrics_evaluator.py
index 1997bdd..58b7230 100644
--- a/python/iceberg/api/expressions/strict_metrics_evaluator.py
+++ b/python/iceberg/api/expressions/strict_metrics_evaluator.py
@@ -51,13 +51,13 @@ class StrictMetricsEvaluator(object):
             self.upper_bounds = None
 
         def eval(self, file):
-            if file.record_count <= 0:
+            if file.record_count() <= 0:
                 return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MUST_MATCH
 
-            self.value_counts = file.value_counts
-            self.null_counts = file.null_value_counts
-            self.lower_bounds = file.lower_bounds
-            self.upper_bounds = file.upper_bounds
+            self.value_counts = file.value_counts()
+            self.null_counts = file.null_value_counts()
+            self.lower_bounds = file.lower_bounds()
+            self.upper_bounds = file.upper_bounds()
 
             return ExpressionVisitors.visit(self.expr, self)
 
diff --git a/python/iceberg/api/schema.py b/python/iceberg/api/schema.py
index 19a154d..76b5339 100644
--- a/python/iceberg/api/schema.py
+++ b/python/iceberg/api/schema.py
@@ -28,7 +28,7 @@ class Schema(object):
     ALL_COLUMNS = "*"
 
     def __init__(self, *argv):
-        aliases = dict()
+        aliases = None
         if len(argv) == 1 and isinstance(argv[0], (list, tuple)):
             columns = argv[0]
         elif len(argv) == 2 and isinstance(argv[0], list) and isinstance(argv[1], dict):
@@ -46,6 +46,7 @@ class Schema(object):
 
         self._id_to_field = None
         self._name_to_id = None
+        self._lowercase_name_to_id = None
         self._id_to_name = None
 
     def as_struct(self):
@@ -66,25 +67,33 @@ class Schema(object):
         if self._name_to_id is None:
             self._name_to_id = index_by_name(self.struct)
             self._id_to_name = {v: k for k, v in self._name_to_id.items()}
+            self._lowercase_name_to_id = {k.lower(): v for k, v in self._name_to_id.items()}
+
         return self._name_to_id
 
+    def lazy_lowercase_name_to_id(self):
+        from .types import index_by_name
+        if self._lowercase_name_to_id is None:
+            self._name_to_id = index_by_name(self.struct)
+            self._id_to_name = {v: k for k, v in self._name_to_id.items()}
+            self._lowercase_name_to_id = {k.lower(): v for k, v in self._name_to_id.items()}
+
+        return self._lowercase_name_to_id
+
     def columns(self):
         return self.struct.fields
 
     def find_type(self, name):
-        if not name:
-            raise RuntimeError("Invalid Column Name (empty)")
-
         if isinstance(name, int):
             field = self.lazy_id_to_field().get(name)
             if field:
                 return field.type
-
-        id = self.lazy_name_to_id().get(name)
-        if id:
-            return self.find_type(id)
-
-        raise RuntimeError("Invalid Column (could not find): %s" % name)
+        elif isinstance(name, str):
+            id = self.lazy_name_to_id().get(name)
+            if id:
+                return self.find_type(id)
+        else:
+            raise RuntimeError("Invalid Column (could not find): %s" % name)
 
     def find_field(self, id):
         if isinstance(id, int):
@@ -109,25 +118,29 @@ class Schema(object):
         if self._id_to_alias:
             return self._id_to_alias.get(field_id)
 
-    def select(self, *argv):
+    def select(self, cols):
+        return self._internal_select(True, cols)
+
+    def case_insensitive_select(self, cols):
+        return self._internal_select(False, cols)
+
+    def _internal_select(self, case_sensitive, cols):
         from .types import select
-        if not(len(argv) == 1 and isinstance(argv[0], list)):
-            return self.select(argv)
 
-        if len(argv) == 1:
-            names = argv[0]
-            if Schema.ALL_COLUMNS in names:
-                return self
+        if Schema.ALL_COLUMNS in cols:
+            return self
+
+        selected = set()
+        for name in cols:
+            if case_sensitive:
+                field_id = self.lazy_name_to_id().get(name)
             else:
-                selected = list()
-                for name in names:
-                    id = self.lazy_name_to_id().get(name)
-                    if id:
-                        selected.append(id)
+                field_id = self.lazy_lowercase_name_to_id().get(name.lower())
 
-                return select(self, selected) # noqa
+            if field_id is not None:
+                selected.add(field_id)
 
-        raise RuntimeError("Illegal argument for select %s", argv)
+        return select(self, selected)
 
     def __repr__(self):
         return "Schema(%s)" % self.struct.fields
diff --git a/python/iceberg/api/table_scan.py b/python/iceberg/api/table_scan.py
index e457b9a..aa67943 100644
--- a/python/iceberg/api/table_scan.py
+++ b/python/iceberg/api/table_scan.py
@@ -21,20 +21,33 @@ class TableScan(object):
     def __init__(self):
         raise NotImplementedError()
 
+    @property
+    def row_filter(self):
+        raise NotImplementedError()
+
     def use_snapshot(self, snapshot_id):
         raise NotImplementedError()
 
     def as_of_time(self, timestamp_millis):
         raise NotImplementedError()
 
+    def project(self, schema):
+        raise NotImplementedError()
+
     def select(self, columns):
         raise NotImplementedError()
 
-    def filter(self, expr=None):
+    def filter(self, expr):
         raise NotImplementedError()
 
-    def plan_file(self):
+    def plan_files(self):
         raise NotImplementedError()
 
     def plan_tasks(self):
         raise NotImplementedError()
+
+    def is_case_sensitive(self):
+        raise NotImplementedError()
+
+    def options(self):
+        raise NotImplementedError()
diff --git a/python/iceberg/api/types/check_compatibility.py b/python/iceberg/api/types/check_compatibility.py
deleted file mode 100644
index e69de29..0000000
diff --git a/python/iceberg/api/types/types.py b/python/iceberg/api/types/types.py
index 40e1d88..225bdee 100644
--- a/python/iceberg/api/types/types.py
+++ b/python/iceberg/api/types/types.py
@@ -337,7 +337,7 @@ class FixedType(PrimitiveType):
         return not self.__eq__(other)
 
     def __repr__(self):
-        return "fixed[%s]" % (self.length())
+        return "fixed[%s]" % (self.length)
 
     def __str__(self):
         return self.__repr__()
@@ -475,6 +475,7 @@ class StructType(NestedType):
 
         self._fieldList = None
         self._fieldsByName = None
+        self._fieldsByLowercaseName = None
         self._fieldsById = None
 
     def __eq__(self, other):
@@ -490,16 +491,19 @@ class StructType(NestedType):
 
     @property
     def fields(self):
-        return self._lazyFieldList()
+        return self._lazy_field_list()
 
     def field(self, name=None, id=None):
         if name:
-            return self._lazyFieldsByName().get(name)
+            return self._lazy_fields_by_name().get(name)
         elif id:
-            return self._lazyFieldsById()[id]
+            return self._lazy_fields_by_id()[id]
 
         raise RuntimeError("No valid field info passed in ")
 
+    def case_insensitive_field(self, name):
+        return self._lazy_fields_by_lowercase_name().get(name)
+
     @property
     def type_id(self):
         return TypeID.STRUCT
@@ -519,27 +523,34 @@ class StructType(NestedType):
     def __key(self):
         return StructType.__class__, self.fields
 
-    def _lazyFieldList(self):
+    def _lazy_field_list(self):
         if self._fieldList is None:
             self._fieldList = tuple(self._fields)
         return self._fieldList
 
-    def _lazyFieldsByName(self):
+    def _lazy_fields_by_name(self):
+        if self._fieldsByName is None:
+            self.index_fields()
+        return self._fieldsByName
+
+    def _lazy_fields_by_lowercase_name(self):
         if self._fieldsByName is None:
-            self.indexFields()
+            self.index_fields()
         return self._fieldsByName
 
-    def _lazyFieldsById(self):
+    def _lazy_fields_by_id(self):
         if self._fieldsById is None:
-            self.indexFields()
+            self.index_fields()
         return self._fieldsById
 
-    def indexFields(self):
+    def index_fields(self):
         self._fieldsByName = dict()
+        self._fieldsByLowercaseName = dict()
         self._fieldsById = dict()
 
         for field in self.fields:
             self._fieldsByName[field.name] = field
+            self._fieldsByLowercaseName[field.name.lower()] = field
             self._fieldsById[field.id] = field
 
 
@@ -668,7 +679,7 @@ class MapType(NestedType):
             return self.value_field
 
     def fields(self):
-        return self._lazyFieldList()
+        return self._lazy_field_list()
 
     def key_id(self):
         return self.key_field.field_id
@@ -702,5 +713,5 @@ class MapType(NestedType):
     def __key(self):
         return MapType.__class__, self.key_field, self.value_field
 
-    def _lazyFieldList(self):
+    def _lazy_field_list(self):
         return tuple(self.key_field, self.value_field)
diff --git a/python/tests/api/expressions/__init__.py b/python/tests/api/expressions/__init__.py
index 2456923..13a8339 100644
--- a/python/tests/api/expressions/__init__.py
+++ b/python/tests/api/expressions/__init__.py
@@ -14,4 +14,3 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
diff --git a/python/tests/api/expressions/conftest.py b/python/tests/api/expressions/conftest.py
index 54039b2..fe1d982 100644
--- a/python/tests/api/expressions/conftest.py
+++ b/python/tests/api/expressions/conftest.py
@@ -18,9 +18,10 @@
 from decimal import Decimal
 import io
 import pickle
+import time
 import uuid
 
-from iceberg.api import DataFile
+from iceberg.api import DataFile, ManifestFile, PartitionFieldSummary, PartitionSpec
 from iceberg.api.expressions import (BoundPredicate,
                                      Expressions,
                                      ExpressionVisitors,
@@ -96,17 +97,83 @@ class TestDataFile(DataFile):
 
     def __init__(self, path, partition, record_count, value_counts=None, null_value_counts=None,
                  lower_bounds=None, upper_bounds=None):
+        self._path = path
+        self._partition = partition
+        self._record_count = record_count
+        self._value_counts = value_counts
+        self._null_value_counts = null_value_counts
+        self._lower_bounds = lower_bounds
+        self._upper_bounds = upper_bounds
+        self._file_size_in_bytes = 0
+        self._block_size_in_bytes = 0
+        self._file_ordinal = None
+        self._column_sizes = None
+
+    def path(self):
+        return self._path
+
+    def partition(self):
+        return self._partition
+
+    def record_count(self):
+        return self._record_count
+
+    def value_counts(self):
+        return self._value_counts
+
+    def null_value_counts(self):
+        return self._null_value_counts
+
+    def lower_bounds(self):
+        return self._lower_bounds
+
+    def upper_bounds(self):
+        return self._upper_bounds
+
+    def file_size_in_bytes(self):
+        return self._file_size_in_bytes
+
+    def file_ordinal(self):
+        return self._file_ordinal
+
+    def column_sizes(self):
+        return self._column_sizes
+
+    def copy(self):
+        return self
+
+
+class TestManifestFile(ManifestFile):
+
+    def __init__(self, path, length, spec_id, snapshot_id, added_files, existing_files, deleted_files, partitions):
         self.path = path
-        self.partition = partition
-        self.record_count = record_count
-        self.value_counts = value_counts
-        self.null_value_counts = null_value_counts
-        self.lower_bounds = lower_bounds
-        self.upper_bounds = upper_bounds
-        self.file_size_in_bytes = 0
-        self.block_size_in_bytes = 0
-        self.file_ordinal = None
-        self.column_size = None
+        self.length = length
+        self.spec_id = spec_id
+        self.snapshot_id = snapshot_id
+        self.added_files = added_files
+        self.existing_files = existing_files
+        self.deleted_files = deleted_files
+        self.partitions = partitions
+
+    def copy(self):
+        return self
+
+
+class TestFieldSummary(PartitionFieldSummary):
+
+    def __init__(self, contains_null, lower_bound, upper_bound):
+        self._contains_null = contains_null
+        self._lower_bound = lower_bound
+        self._upper_bound = upper_bound
+
+    def contains_null(self):
+        return self._contains_null
+
+    def lower_bound(self):
+        return self._lower_bound
+
+    def upper_bound(self):
+        return self._upper_bound
 
     def copy(self):
         return self
@@ -273,6 +340,18 @@ def not_eq_rewrite(request):
 
 
 @pytest.fixture(scope="session",
+                params=[Expressions.equal("ID", 5),
+                        Expressions.equal("ID", 29),
+                        Expressions.equal("ID", 30),
+                        Expressions.equal("ID", 75),
+                        Expressions.equal("ID", 79),
+                        Expressions.equal("ID", 80),
+                        Expressions.equal("ID", 85)])
+def not_eq_uc(request):
+    yield request.param
+
+
+@pytest.fixture(scope="session",
                 params=[Literal.of(False),
                         Literal.of(34),
                         Literal.of(35),
@@ -305,3 +384,42 @@ def type_val_tuples(request):
                         (DecimalType.of(9, 4), "34.5600")])
 def float_type_val_tuples(request):
     yield request.param
+
+
+@pytest.fixture(scope="session")
+def inc_man_spec():
+    inc_schema = Schema(NestedField.required(1, "id", IntegerType.get()),
+                        NestedField.optional(4, "all_nulls", StringType.get()),
+                        NestedField.optional(5, "some_nulls", StringType.get()),
+                        NestedField.optional(6, "no_nulls", StringType.get()))
+    return (PartitionSpec.builder_for(inc_schema)
+            .with_spec_id(0)
+            .identity("id")
+            .identity("all_nulls")
+            .identity("some_nulls")
+            .identity("no_nulls")
+            .build()
+            )
+
+
+@pytest.fixture(scope="session")
+def inc_man_file():
+    return TestManifestFile("manifest-list.avro", 1024, 0, int(time.time() * 1000), 5, 10, 0,
+                            (TestFieldSummary(False,
+                                              Conversions.to_byte_buffer(IntegerType.get(), 30),
+                                              Conversions.to_byte_buffer(IntegerType.get(), 79)),
+                             TestFieldSummary(True,
+                                              None,
+                                              None),
+                             TestFieldSummary(True,
+                                              Conversions.to_byte_buffer(StringType.get(), 'a'),
+                                              Conversions.to_byte_buffer(StringType.get(), 'z')),
+                             TestFieldSummary(False,
+                                              Conversions.to_byte_buffer(StringType.get(), 'a'),
+                                              Conversions.to_byte_buffer(StringType.get(), 'z'))
+                             ))
+
+
+@pytest.fixture(scope="session")
+def inc_man_file_ns():
+    return TestManifestFile("manifest-list.avro", 1024, 0, int(time.time() * 1000), None, None, None, None)
diff --git a/python/tests/api/expressions/test_evaluator.py b/python/tests/api/expressions/test_evaluator.py
index ec3d08f..b5820f4 100644
--- a/python/tests/api/expressions/test_evaluator.py
+++ b/python/tests/api/expressions/test_evaluator.py
@@ -15,11 +15,14 @@
 # specific language governing permissions and limitations
 # under the License.
 
+
 import iceberg.api.expressions as exp
 from iceberg.api.types import (IntegerType,
                                NestedField,
                                StringType,
                                StructType)
+from iceberg.exceptions import ValidationException
+from pytest import raises
 
 STRUCT = StructType.of([NestedField.required(13, "x", IntegerType.get()),
                        NestedField.required(14, "y", IntegerType.get()),
@@ -123,6 +126,21 @@ def test_not(row_of):
     assert evaluator.eval(row_of((8,)))
 
 
+def test_case_insensitive_not(row_of):
+    evaluator = exp.evaluator.Evaluator(STRUCT,
+                                        exp.expressions.Expressions.not_(exp.expressions.Expressions.equal("X", 7)),
+                                        case_sensitive=False)
+    assert not evaluator.eval(row_of((7,)))
+    assert evaluator.eval(row_of((8,)))
+
+
+def test_case_sensitive_not():
+    with raises(ValidationException):
+        exp.evaluator.Evaluator(STRUCT,
+                                exp.expressions.Expressions.not_(exp.expressions.Expressions.equal("X", 7)),
+                                case_sensitive=True)
+
+
 def test_char_seq_value(row_of):
     struct = StructType.of([NestedField.required(34, "s", StringType.get())])
     evaluator = exp.evaluator.Evaluator(struct, exp.expressions.Expressions.equal("s", "abc"))
diff --git a/python/tests/api/expressions/test_expression_binding.py b/python/tests/api/expressions/test_expression_binding.py
index 47f6fea..f6a4d98 100644
--- a/python/tests/api/expressions/test_expression_binding.py
+++ b/python/tests/api/expressions/test_expression_binding.py
@@ -53,6 +53,18 @@ def test_single_reference(assert_all_bound):
                      Binder.bind(STRUCT, expr))
 
 
+def test_case_insensitive_reference(assert_all_bound):
+    expr = Expressions.not_(Expressions.equal("X", 7))
+    assert_all_bound("Single reference",
+                     Binder.bind(STRUCT, expr, case_sensitive=False))
+
+
+def test_case_sensitive_reference():
+    with raises(ice_ex.ValidationException):
+        expr = Expressions.not_(Expressions.equal("X", 7))
+        Binder.bind(STRUCT, expr, case_sensitive=True)
+
+
 def test_multiple_references(assert_all_bound):
     expr = Expressions.or_(Expressions.and_(Expressions.equal("x", 7),
                                             Expressions.less_than("y", 100)),
diff --git a/python/tests/api/expressions/test_inclusive_metrics_evaluator.py b/python/tests/api/expressions/test_inclusive_metrics_evaluator.py
index fcbef1e..e9f0473 100644
--- a/python/tests/api/expressions/test_inclusive_metrics_evaluator.py
+++ b/python/tests/api/expressions/test_inclusive_metrics_evaluator.py
@@ -17,6 +17,7 @@
 
 from iceberg.api.expressions import (Expressions,
                                      InclusiveMetricsEvaluator)
+from iceberg.exceptions import ValidationException
 from pytest import raises
 
 
@@ -116,3 +117,14 @@ def test_integer_not_eq(schema, file, not_eq):
 
 def test_not_eq_rewritten(schema, file, not_eq_rewrite):
     assert InclusiveMetricsEvaluator(schema, Expressions.not_(not_eq_rewrite)).eval(file)
+
+
+def test_case_insensitive_int_not_eq_rewritten(schema, file, not_eq_uc):
+    assert InclusiveMetricsEvaluator(schema, Expressions.not_(not_eq_uc),
+                                     case_sensitive=False).eval(file)
+
+
+def test_case_sensitive_int_not_eq_rewritten(schema, file, not_eq_uc):
+    with raises(ValidationException):
+        assert InclusiveMetricsEvaluator(schema, Expressions.not_(not_eq_uc),
+                                         case_sensitive=True).eval(file)

[incubator-iceberg] branch master updated: Backporting case-sensitivity flag to expression module (#196)

Reply via email to