This is an automated email from the ASF dual-hosted git repository.

michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 88cb9c19083ae8c2bc70d373a1a70384d476b9cd
Author: Zoltan Borok-Nagy <[email protected]>
AuthorDate: Thu Oct 17 13:41:46 2024 +0200

    IMPALA-13463: Impala should ignore case of Iceberg schema elements
    
    Schema element names are case insensitive in Impala. However, via Spark
    it's possible to create schema elements whose names contain upper-case
    letters and store them in the metadata JSON files of Iceberg, e.g.:
       "schemas" : [ {
         "type" : "struct",
         "schema-id" : 0,
         "fields" : [ {
           "id" : 1,
           "name" : "ID",
           "required" : false,
           "type" : "string"
         }, {
           "id" : 2,
           "name" : "OWNERID",
           "required" : false,
           "type" : "string"
         } ]
       } ],
    
    This can cause problems in Impala during predicate pushdown, as we can
    get a ValidationException from the Iceberg library (as Impala pushes
    down predicates with lower case column names, while Iceberg sees upper
    case names).
    
    With this patch Impala invokes Scan.caseSensitive(false) on the TableScan
    object, so that Iceberg resolves column names case-insensitively.
    
    Testing:
     * added e2e test
    
    Change-Id: Iedaf152d8a0c02a124c3dcf8acb59b4ba4e81cf4
    Reviewed-on: http://gerrit.cloudera.org:8080/21950
    Tested-by: Impala Public Jenkins <[email protected]>
    Reviewed-by: Wenzhe Zhou <[email protected]>
    Reviewed-by: Daniel Becker <[email protected]>
---
 .../java/org/apache/impala/util/IcebergUtil.java   |   9 +-
 ...e91c0129-f018b1d800000000_872469098_data.0.parq | Bin 0 -> 605 bytes
 ...97c4c65-c9fce43a00000000_1852333400_data.0.parq | Bin 0 -> 591 bytes
 .../1a457d69-768a-4bfd-8da5-c080d3b88e50-m0.avro   | Bin 0 -> 6039 bytes
 .../96461a99-3b56-4573-ab6d-8b8ba3fbcae2-m0.avro   | Bin 0 -> 6034 bytes
 ...667-1-96461a99-3b56-4573-ab6d-8b8ba3fbcae2.avro | Bin 0 -> 3875 bytes
 ...468-1-1a457d69-768a-4bfd-8da5-c080d3b88e50.avro | Bin 0 -> 3797 bytes
 .../metadata/v3.metadata.json                      | 127 +++++++++++++++++++++
 .../metadata/version-hint.text                     |   1 +
 .../iceberg-column-case-sensitivity-issue.test     |  20 ++++
 tests/query_test/test_iceberg.py                   |   6 +
 11 files changed, 161 insertions(+), 2 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/util/IcebergUtil.java b/fe/src/main/java/org/apache/impala/util/IcebergUtil.java
index 3bacdfd3a..5ad64f71e 100644
--- a/fe/src/main/java/org/apache/impala/util/IcebergUtil.java
+++ b/fe/src/main/java/org/apache/impala/util/IcebergUtil.java
@@ -655,7 +655,7 @@ public class IcebergUtil {
 
   private static TableScan createScanAsOf(FeIcebergTable table,
       TimeTravelSpec timeTravelSpec) {
-    TableScan scan = table.getIcebergApiTable().newScan();
+    TableScan scan = newScan(table);
     if (timeTravelSpec == null) {
       scan = scan.useSnapshot(table.snapshotId());
     } else {
@@ -687,7 +687,7 @@ public class IcebergUtil {
     if (table.snapshotId() == -1) {
       return new GroupedContentFiles(CloseableIterable.empty());
     }
-    TableScan scan = table.getIcebergApiTable().newScan();
+    TableScan scan = newScan(table);
     scan = scan.useSnapshot(snapshotId);
     for (Expression predicate : predicates) {
       scan = scan.filter(predicate);
@@ -699,6 +699,11 @@ public class IcebergUtil {
     }
   }
 
+  private static TableScan newScan(FeIcebergTable table) {
+    TableScan scan = table.getIcebergApiTable().newScan();
+    return scan.caseSensitive(false);
+  }
+
   /**
    * Use ContentFile path to generate 128-bit Murmur3 hash as map key, cached in memory
    */
diff --git 
a/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/data/id_bucket=3/5b4ef6d2e91c0129-f018b1d800000000_872469098_data.0.parq
 
b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/data/id_bucket=3/5b4ef6d2e91c0129-f018b1d800000000_872469098_data.0.parq
new file mode 100644
index 000000000..2b5988bf9
Binary files /dev/null and 
b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/data/id_bucket=3/5b4ef6d2e91c0129-f018b1d800000000_872469098_data.0.parq
 differ
diff --git 
a/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/data/id_bucket=7/504c5f5ae97c4c65-c9fce43a00000000_1852333400_data.0.parq
 
b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/data/id_bucket=7/504c5f5ae97c4c65-c9fce43a00000000_1852333400_data.0.parq
new file mode 100644
index 000000000..f70cd0c84
Binary files /dev/null and 
b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/data/id_bucket=7/504c5f5ae97c4c65-c9fce43a00000000_1852333400_data.0.parq
 differ
diff --git 
a/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/1a457d69-768a-4bfd-8da5-c080d3b88e50-m0.avro
 
b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/1a457d69-768a-4bfd-8da5-c080d3b88e50-m0.avro
new file mode 100644
index 000000000..a27606f92
Binary files /dev/null and 
b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/1a457d69-768a-4bfd-8da5-c080d3b88e50-m0.avro
 differ
diff --git 
a/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/96461a99-3b56-4573-ab6d-8b8ba3fbcae2-m0.avro
 
b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/96461a99-3b56-4573-ab6d-8b8ba3fbcae2-m0.avro
new file mode 100644
index 000000000..704686951
Binary files /dev/null and 
b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/96461a99-3b56-4573-ab6d-8b8ba3fbcae2-m0.avro
 differ
diff --git 
a/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/snap-1855055649619147667-1-96461a99-3b56-4573-ab6d-8b8ba3fbcae2.avro
 
b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/snap-1855055649619147667-1-96461a99-3b56-4573-ab6d-8b8ba3fbcae2.avro
new file mode 100644
index 000000000..a336f60d1
Binary files /dev/null and 
b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/snap-1855055649619147667-1-96461a99-3b56-4573-ab6d-8b8ba3fbcae2.avro
 differ
diff --git 
a/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/snap-7743982156242154468-1-1a457d69-768a-4bfd-8da5-c080d3b88e50.avro
 
b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/snap-7743982156242154468-1-1a457d69-768a-4bfd-8da5-c080d3b88e50.avro
new file mode 100644
index 000000000..f5e8d3bd7
Binary files /dev/null and 
b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/snap-7743982156242154468-1-1a457d69-768a-4bfd-8da5-c080d3b88e50.avro
 differ
diff --git 
a/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/v3.metadata.json
 
b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/v3.metadata.json
new file mode 100644
index 000000000..0095d2b7f
--- /dev/null
+++ 
b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/v3.metadata.json
@@ -0,0 +1,127 @@
+{
+  "format-version" : 1,
+  "table-uuid" : "6259114b-df40-4263-9375-4bd3102965d2",
+  "location" : 
"hdfs://localhost:20500/test-warehouse/iceberg_column_case_sensitivity_issue",
+  "last-updated-ms" : 1729164485534,
+  "last-column-id" : 2,
+  "schema" : {
+    "type" : "struct",
+    "schema-id" : 0,
+    "fields" : [ {
+      "id" : 1,
+      "name" : "ID",
+      "required" : false,
+      "type" : "string"
+    }, {
+      "id" : 2,
+      "name" : "OWNERID",
+      "required" : false,
+      "type" : "string"
+    } ]
+  },
+  "current-schema-id" : 0,
+  "schemas" : [ {
+    "type" : "struct",
+    "schema-id" : 0,
+    "fields" : [ {
+      "id" : 1,
+      "name" : "ID",
+      "required" : false,
+      "type" : "string"
+    }, {
+      "id" : 2,
+      "name" : "OWNERID",
+      "required" : false,
+      "type" : "string"
+    } ]
+  } ],
+  "partition-spec" : [ {
+    "name" : "ID_bucket",
+    "transform" : "bucket[16]",
+    "source-id" : 1,
+    "field-id" : 1000
+  } ],
+  "default-spec-id" : 0,
+  "partition-specs" : [ {
+    "spec-id" : 0,
+    "fields" : [ {
+      "name" : "ID_bucket",
+      "transform" : "bucket[16]",
+      "source-id" : 1,
+      "field-id" : 1000
+    } ]
+  } ],
+  "last-partition-id" : 1000,
+  "default-sort-order-id" : 0,
+  "sort-orders" : [ {
+    "order-id" : 0,
+    "fields" : [ ]
+  } ],
+  "properties" : {
+    "engine.hive.enabled" : "true",
+    "OBJCAPABILITIES" : "EXTREAD,EXTWRITE",
+    "storage_handler" : "org.apache.iceberg.mr.hive.HiveIcebergStorageHandler",
+    "write.format.default" : "parquet",
+    "iceberg.catalog" : "hadoop.tables"
+  },
+  "current-snapshot-id" : 1855055649619147667,
+  "refs" : {
+    "main" : {
+      "snapshot-id" : 1855055649619147667,
+      "type" : "branch"
+    }
+  },
+  "snapshots" : [ {
+    "snapshot-id" : 7743982156242154468,
+    "timestamp-ms" : 1729164477675,
+    "summary" : {
+      "operation" : "append",
+      "added-data-files" : "1",
+      "added-records" : "1",
+      "added-files-size" : "605",
+      "changed-partition-count" : "1",
+      "total-records" : "1",
+      "total-files-size" : "605",
+      "total-data-files" : "1",
+      "total-delete-files" : "0",
+      "total-position-deletes" : "0",
+      "total-equality-deletes" : "0"
+    },
+    "manifest-list" : 
"hdfs://localhost:20500/test-warehouse/iceberg_column_case_sensitivity_issue/metadata/snap-7743982156242154468-1-1a457d69-768a-4bfd-8da5-c080d3b88e50.avro",
+    "schema-id" : 0
+  }, {
+    "snapshot-id" : 1855055649619147667,
+    "parent-snapshot-id" : 7743982156242154468,
+    "timestamp-ms" : 1729164485534,
+    "summary" : {
+      "operation" : "append",
+      "added-data-files" : "1",
+      "added-records" : "1",
+      "added-files-size" : "591",
+      "changed-partition-count" : "1",
+      "total-records" : "2",
+      "total-files-size" : "1196",
+      "total-data-files" : "2",
+      "total-delete-files" : "0",
+      "total-position-deletes" : "0",
+      "total-equality-deletes" : "0"
+    },
+    "manifest-list" : 
"hdfs://localhost:20500/test-warehouse/iceberg_column_case_sensitivity_issue/metadata/snap-1855055649619147667-1-96461a99-3b56-4573-ab6d-8b8ba3fbcae2.avro",
+    "schema-id" : 0
+  } ],
+  "statistics" : [ ],
+  "snapshot-log" : [ {
+    "timestamp-ms" : 1729164477675,
+    "snapshot-id" : 7743982156242154468
+  }, {
+    "timestamp-ms" : 1729164485534,
+    "snapshot-id" : 1855055649619147667
+  } ],
+  "metadata-log" : [ {
+    "timestamp-ms" : 1729164453998,
+    "metadata-file" : 
"hdfs://localhost:20500/test-warehouse/iceberg_column_case_sensitivity_issue/metadata/v1.metadata.json"
+  }, {
+    "timestamp-ms" : 1729164477675,
+    "metadata-file" : 
"hdfs://localhost:20500/test-warehouse/iceberg_column_case_sensitivity_issue/metadata/v2.metadata.json"
+  } ]
+}
diff --git 
a/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/version-hint.text
 
b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/version-hint.text
new file mode 100644
index 000000000..e440e5c84
--- /dev/null
+++ 
b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/version-hint.text
@@ -0,0 +1 @@
+3
\ No newline at end of file
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/iceberg-column-case-sensitivity-issue.test
 
b/testdata/workloads/functional-query/queries/QueryTest/iceberg-column-case-sensitivity-issue.test
new file mode 100644
index 000000000..a1907b1ac
--- /dev/null
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/iceberg-column-case-sensitivity-issue.test
@@ -0,0 +1,20 @@
+====
+---- QUERY
+select * from iceberg_column_case_sensitivity_issue;
+---- RESULTS
+'1','impala'
+'2','hive'
+---- TYPES
+STRING, STRING
+====
+---- QUERY
+# Check that predicate pushdown works well
+select * from iceberg_column_case_sensitivity_issue
+where id = '1';
+---- RESULTS
+'1','impala'
+---- TYPES
+STRING, STRING
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+====
diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py
index 663493369..0cc5e72a7 100644
--- a/tests/query_test/test_iceberg.py
+++ b/tests/query_test/test_iceberg.py
@@ -271,6 +271,12 @@ class TestIcebergTable(IcebergTestSuite):
     self.run_test_case('QueryTest/iceberg-migrated-table-field-id-resolution',
                        vector, unique_database)
 
+  def test_column_case_sensitivity(self, vector, unique_database):
+    create_iceberg_table_from_directory(self.client, unique_database,
+        "iceberg_column_case_sensitivity_issue", "parquet")
+    self.run_test_case('QueryTest/iceberg-column-case-sensitivity-issue',
+                       vector, unique_database)
+
   @SkipIfFS.hive
   def test_migrated_table_field_id_resolution_complex(self, vector, unique_database):
     def get_table_loc(tbl_name):

Reply via email to