This is an automated email from the ASF dual-hosted git repository.

boroknagyz pushed a commit to branch branch-4.4.0
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 99ce967ba60666adff5dd74fd38a06fee7f2c521
Author: Zoltan Borok-Nagy <[email protected]>
AuthorDate: Mon Apr 15 18:30:35 2024 +0200

    IMPALA-13002: Iceberg V2 tables with Avro delete files aren't read properly
    
    If the Iceberg table has Avro delete files (e.g. because
    'write.delete.format.default'='avro' is set), Impala cannot read the
    contents of the delete files correctly. This is because the Avro
    schema is not set properly for the virtual delete table.
    
    Testing:
     * added e2e tests with position delete files of all kinds
    
    Change-Id: Iff13198991caf32c51cd9e0ace4454fd00216cf6
    Reviewed-on: http://gerrit.cloudera.org:8080/21301
    Tested-by: Impala Public Jenkins <[email protected]>
    Reviewed-by: Daniel Becker <[email protected]>
    Reviewed-by: Gabor Kaszab <[email protected]>
---
 .../apache/impala/catalog/IcebergDeleteTable.java  |   5 +
 .../iceberg-mixed-format-position-deletes.test     | 133 +++++++++++++++++++++
 tests/query_test/test_iceberg.py                   |   5 +
 3 files changed, 143 insertions(+)

diff --git a/fe/src/main/java/org/apache/impala/catalog/IcebergDeleteTable.java b/fe/src/main/java/org/apache/impala/catalog/IcebergDeleteTable.java
index ed47c7690..40f2c2c94 100644
--- a/fe/src/main/java/org/apache/impala/catalog/IcebergDeleteTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/IcebergDeleteTable.java
@@ -33,6 +33,7 @@ import org.apache.impala.thrift.TIcebergFileFormat;
 import org.apache.impala.thrift.TIcebergPartitionStats;
 import org.apache.impala.thrift.TTableDescriptor;
 import org.apache.impala.thrift.TTableStats;
+import org.apache.impala.util.AvroSchemaConverter;
 
 /**
  * Base class for the virtual table implementations for Iceberg deletes, like position or
@@ -83,6 +84,10 @@ public abstract class IcebergDeleteTable extends VirtualTable implements FeIcebe
       TTableDescriptor desc =
           baseTable_.toThriftDescriptor(tableId, referencedPartitions);
       desc.setColumnDescriptors(FeCatalogUtils.getTColumnDescriptors(this));
+      if (desc.hdfsTable.isSetAvroSchema()) {
+        desc.hdfsTable.setAvroSchema(AvroSchemaConverter.convertColumns(getColumns(),
+            getFullName().replaceAll("-", "_")).toString());
+      }
       return desc;
     }
 
diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-mixed-format-position-deletes.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-mixed-format-position-deletes.test
new file mode 100644
index 000000000..e1452a274
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-mixed-format-position-deletes.test
@@ -0,0 +1,133 @@
+====
+---- HIVE_QUERY
+use $DATABASE;
+CREATE TABLE ice_mixed_formats(i int, j int)
+STORED BY ICEBERG
+STORED AS PARQUET
+TBLPROPERTIES ('format-version'='2');
+INSERT INTO ice_mixed_formats VALUES (1, 1);
+DELETE FROM ice_mixed_formats WHERE i = 1;
+ALTER TABLE ice_mixed_formats SET TBLPROPERTIES ('write.format.default'='orc');
+INSERT INTO ice_mixed_formats VALUES (2, 2);
+DELETE FROM ice_mixed_formats WHERE i = 2;
+INSERT INTO ice_mixed_formats VALUES (3, 3);
+INSERT INTO ice_mixed_formats VALUES (10, 10);
+ALTER TABLE ice_mixed_formats SET TBLPROPERTIES ('write.format.default'='avro');
+DELETE FROM ice_mixed_formats WHERE i = 3;
+====
+---- QUERY
+refresh ice_mixed_formats;
+====
+---- QUERY
+SHOW FILES IN ice_mixed_formats;
+---- RESULTS: VERIFY_IS_SUBSET
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats/data/.*-data-.*.parquet','.*B','','.*'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats/data/.*-delete-.*parquet','.*B','','.*'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats/data/.*-data-.*.orc','.*B','','.*'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats/data/.*-delete-.*orc','.*B','','.*'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats/data/.*-delete-.*.avro','.*B','','.*'
+---- TYPES
+STRING, STRING, STRING, STRING
+====
+---- QUERY
+select * from ice_mixed_formats;
+---- RESULTS
+10,10
+---- TYPES
+INT,INT
+====
+---- QUERY
+select count(*) from ice_mixed_formats;
+---- RESULTS
+1
+---- TYPES
+BIGINT
+====
+---- QUERY
+select * from ice_mixed_formats where i > 1;
+---- RESULTS
+10,10
+---- TYPES
+INT,INT
+====
+---- QUERY
+select count(*) from ice_mixed_formats where i > 1;
+---- RESULTS
+1
+---- TYPES
+BIGINT
+====
+---- QUERY
+select i, count(*) from ice_mixed_formats group by 1;
+---- RESULTS
+10,1
+---- TYPES
+INT,BIGINT
+====
+---- HIVE_QUERY
+use $DATABASE;
+CREATE TABLE ice_mixed_formats_partitioned(i int, j int)
+PARTITIONED BY SPEC (truncate(2, j))
+STORED BY ICEBERG
+STORED AS PARQUET
+TBLPROPERTIES ('format-version'='2');
+INSERT INTO ice_mixed_formats_partitioned VALUES (1, 1);
+DELETE FROM ice_mixed_formats_partitioned WHERE i = 1;
+ALTER TABLE ice_mixed_formats_partitioned SET TBLPROPERTIES ('write.format.default'='orc');
+INSERT INTO ice_mixed_formats_partitioned VALUES (2, 2);
+DELETE FROM ice_mixed_formats_partitioned WHERE i = 2;
+INSERT INTO ice_mixed_formats_partitioned VALUES (3, 3);
+INSERT INTO ice_mixed_formats_partitioned VALUES (10, 10);
+ALTER TABLE ice_mixed_formats_partitioned SET TBLPROPERTIES ('write.format.default'='avro');
+DELETE FROM ice_mixed_formats_partitioned WHERE i = 3;
+====
+---- QUERY
+refresh ice_mixed_formats_partitioned;
+====
+---- QUERY
+SHOW FILES IN ice_mixed_formats_partitioned;
+---- RESULTS: VERIFY_IS_SUBSET
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats_partitioned/data/j_trunc=0/.*-data-.*.parquet','.*B','','.*'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats_partitioned/data/j_trunc=0/.*-delete-.*parquet','.*B','','.*'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats_partitioned/data/j_trunc=2/.*-data-.*.orc','.*B','','.*'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats_partitioned/data/j_trunc=2/.*-delete-.*orc','.*B','','.*'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats_partitioned/data/j_trunc=10/.*-data-.*.orc','.*B','','.*'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats_partitioned/data/j_trunc=2/.*-delete-.*.avro','.*B','','.*'
+---- TYPES
+STRING, STRING, STRING, STRING
+====
+---- QUERY
+select * from ice_mixed_formats_partitioned;
+---- RESULTS
+10,10
+---- TYPES
+INT,INT
+====
+---- QUERY
+select count(*) from ice_mixed_formats_partitioned;
+---- RESULTS
+1
+---- TYPES
+BIGINT
+====
+---- QUERY
+select * from ice_mixed_formats_partitioned where i > 1;
+---- RESULTS
+10,10
+---- TYPES
+INT,INT
+====
+---- QUERY
+select count(*) from ice_mixed_formats_partitioned where i > 1;
+---- RESULTS
+1
+---- TYPES
+BIGINT
+====
+---- QUERY
+select i, count(*) from ice_mixed_formats_partitioned group by 1;
+---- RESULTS
+10,1
+---- TYPES
+INT,BIGINT
+====
diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py
index 6c2d646f9..2e715b160 100644
--- a/tests/query_test/test_iceberg.py
+++ b/tests/query_test/test_iceberg.py
@@ -1463,6 +1463,11 @@ class TestIcebergV2Table(IcebergTestSuite):
   def test_read_position_deletes(self, vector):
     self.run_test_case('QueryTest/iceberg-v2-read-position-deletes', vector)
 
+  @SkipIfFS.hive
+  def test_read_mixed_format_position_deletes(self, vector, unique_database):
+    self.run_test_case('QueryTest/iceberg-mixed-format-position-deletes',
+        vector, unique_database)
+
   @SkipIfDockerizedCluster.internal_hostname
   @SkipIf.hardcoded_uris
   def test_read_null_delete_records(self, vector):

Reply via email to