This is an automated email from the ASF dual-hosted git repository.
ayushsaxena pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 329ce884e77 HIVE-29230: Iceberg: Reads fail after Schema evolution
with complex type columns (#6106)
329ce884e77 is described below
commit 329ce884e77631803b156b2855efd8f978dee686
Author: Ayush Saxena <[email protected]>
AuthorDate: Wed Oct 1 11:30:30 2025 +0530
HIVE-29230: Iceberg: Reads fail after Schema evolution with complex type
columns (#6106)
---
.../queries/positive/iceberg_add_complex_column.q | 17 +++++
.../positive/iceberg_add_complex_column.q.out | 76 ++++++++++++++++++++++
.../vector/VectorizedParquetRecordReader.java | 12 ++--
3 files changed, 100 insertions(+), 5 deletions(-)
diff --git
a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_add_complex_column.q
b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_add_complex_column.q
new file mode 100644
index 00000000000..d206fd8e76c
--- /dev/null
+++
b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_add_complex_column.q
@@ -0,0 +1,17 @@
+CREATE TABLE t_complex (id INT) STORED BY ICEBERG;
+
+INSERT INTO t_complex (id) VALUES (1);
+
+ALTER TABLE t_complex ADD COLUMNS (col1 STRUCT<x:INT, y:INT>);
+
+INSERT INTO t_complex VALUES (2, named_struct("x", 10, "y", 20));
+
+ALTER TABLE t_complex ADD COLUMNS (col2 map<string,string>);
+
+INSERT INTO t_complex VALUES (3, named_struct("x", 11, "y", 22), map("k1",
"v1", "k2", "v2"));
+
+ALTER TABLE t_complex ADD COLUMNS (col3 array<int>);
+
+INSERT INTO t_complex VALUES (4, named_struct("x", 5, "y", 18), map("k22",
"v22", "k33", "v44"), array(1, 2, 3));
+
+SELECT * FROM t_complex ORDER BY id;
\ No newline at end of file
diff --git
a/iceberg/iceberg-handler/src/test/results/positive/iceberg_add_complex_column.q.out
b/iceberg/iceberg-handler/src/test/results/positive/iceberg_add_complex_column.q.out
new file mode 100644
index 00000000000..80ef8542a89
--- /dev/null
+++
b/iceberg/iceberg-handler/src/test/results/positive/iceberg_add_complex_column.q.out
@@ -0,0 +1,76 @@
+PREHOOK: query: CREATE TABLE t_complex (id INT) STORED BY ICEBERG
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t_complex
+POSTHOOK: query: CREATE TABLE t_complex (id INT) STORED BY ICEBERG
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t_complex
+PREHOOK: query: INSERT INTO t_complex (id) VALUES (1)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@t_complex
+POSTHOOK: query: INSERT INTO t_complex (id) VALUES (1)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@t_complex
+PREHOOK: query: ALTER TABLE t_complex ADD COLUMNS (col1 STRUCT<x:INT, y:INT>)
+PREHOOK: type: ALTERTABLE_ADDCOLS
+PREHOOK: Input: default@t_complex
+PREHOOK: Output: default@t_complex
+POSTHOOK: query: ALTER TABLE t_complex ADD COLUMNS (col1 STRUCT<x:INT, y:INT>)
+POSTHOOK: type: ALTERTABLE_ADDCOLS
+POSTHOOK: Input: default@t_complex
+POSTHOOK: Output: default@t_complex
+PREHOOK: query: INSERT INTO t_complex VALUES (2, named_struct("x", 10, "y",
20))
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@t_complex
+POSTHOOK: query: INSERT INTO t_complex VALUES (2, named_struct("x", 10, "y",
20))
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@t_complex
+PREHOOK: query: ALTER TABLE t_complex ADD COLUMNS (col2 map<string,string>)
+PREHOOK: type: ALTERTABLE_ADDCOLS
+PREHOOK: Input: default@t_complex
+PREHOOK: Output: default@t_complex
+POSTHOOK: query: ALTER TABLE t_complex ADD COLUMNS (col2 map<string,string>)
+POSTHOOK: type: ALTERTABLE_ADDCOLS
+POSTHOOK: Input: default@t_complex
+POSTHOOK: Output: default@t_complex
+PREHOOK: query: INSERT INTO t_complex VALUES (3, named_struct("x", 11, "y",
22), map("k1", "v1", "k2", "v2"))
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@t_complex
+POSTHOOK: query: INSERT INTO t_complex VALUES (3, named_struct("x", 11, "y",
22), map("k1", "v1", "k2", "v2"))
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@t_complex
+PREHOOK: query: ALTER TABLE t_complex ADD COLUMNS (col3 array<int>)
+PREHOOK: type: ALTERTABLE_ADDCOLS
+PREHOOK: Input: default@t_complex
+PREHOOK: Output: default@t_complex
+POSTHOOK: query: ALTER TABLE t_complex ADD COLUMNS (col3 array<int>)
+POSTHOOK: type: ALTERTABLE_ADDCOLS
+POSTHOOK: Input: default@t_complex
+POSTHOOK: Output: default@t_complex
+PREHOOK: query: INSERT INTO t_complex VALUES (4, named_struct("x", 5, "y",
18), map("k22", "v22", "k33", "v44"), array(1, 2, 3))
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@t_complex
+POSTHOOK: query: INSERT INTO t_complex VALUES (4, named_struct("x", 5, "y",
18), map("k22", "v22", "k33", "v44"), array(1, 2, 3))
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@t_complex
+PREHOOK: query: SELECT * FROM t_complex ORDER BY id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_complex
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT * FROM t_complex ORDER BY id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_complex
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1 NULL NULL NULL
+2 {"x":10,"y":20} NULL NULL
+3 {"x":11,"y":22} {"k1":"v1","k2":"v2"} NULL
+4 {"x":5,"y":18} {"k22":"v22","k33":"v44"} [1,2,3]
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
index a7ab49c43df..8ce85db5e33 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
@@ -519,20 +519,22 @@ private VectorizedColumnReader
buildVectorizedParquetReader(
int depth) throws IOException {
List<ColumnDescriptor> descriptors =
getAllColumnDescriptorByType(depth, type, columnDescriptors);
+ // Support for schema evolution: if the column from the current
+ // query schema is not present in the file schema, return a dummy
+ // reader that produces nulls. This allows queries to proceed even
+ // when new columns have been added after the file was written.
+ if (!fileSchema.getColumns().contains(descriptors.get(0))) {
+ return new VectorizedDummyColumnReader();
+ }
switch (typeInfo.getCategory()) {
case PRIMITIVE:
if (columnDescriptors == null || columnDescriptors.isEmpty()) {
throw new RuntimeException(
"Failed to find related Parquet column descriptor with type " +
type);
}
- if (fileSchema.getColumns().contains(descriptors.get(0))) {
return new VectorizedPrimitiveColumnReader(descriptors.get(0),
pages.getPageReader(descriptors.get(0)), skipTimestampConversion,
writerTimezone, skipProlepticConversion,
legacyConversionEnabled, type, typeInfo);
- } else {
- // Support for schema evolution
- return new VectorizedDummyColumnReader();
- }
case STRUCT:
StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
List<VectorizedColumnReader> fieldReaders = new ArrayList<>();