(doris) 02/02: [fix](iceberg) Fill row-id hidden column for format-version 1 tables

daidai Wed, 03 Jun 2026 02:17:16 -0700

This is an automated email from the ASF dual-hosted git repository.

hubgeter pushed a commit to branch iceberg-v3-test-branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git


commit ec5b969d464781259817a3caf25719242129a89c
Author: daidai <[email protected]>
AuthorDate: Wed Jun 3 17:09:25 2026 +0800

    [fix](iceberg) Fill row-id hidden column for format-version 1 tables
---
 be/src/format/table/iceberg_reader.cpp             |  18 ++-
 .../test_iceberg_v1_rowid_hidden_column.groovy     | 175 +++++++++++++++++++++
 2 files changed, 188 insertions(+), 5 deletions(-)

diff --git a/be/src/format/table/iceberg_reader.cpp 
b/be/src/format/table/iceberg_reader.cpp
index 5e1ee2d6cb2..5b90cac648d 100644
--- a/be/src/format/table/iceberg_reader.cpp
+++ b/be/src/format/table/iceberg_reader.cpp
@@ -205,16 +205,19 @@ Status IcebergTableReader::init_row_filters() {
     }
 
     const auto& table_desc = _range.table_format_params.iceberg_params;
-    const auto& version = table_desc.format_version;
-    if (version < MIN_SUPPORT_DELETE_FILES_VERSION) {
-        return Status::OK();
-    }
 
     auto* parquet_reader = 
dynamic_cast<ParquetReader*>(_file_format_reader.get());
     auto* orc_reader = dynamic_cast<OrcReader*>(_file_format_reader.get());
 
     // Initialize file information for $row_id generation
-    // Extract from table_desc which contains current file's metadata
+    // Extract from table_desc which contains current file's metadata.
+    // NOTE: row-id generation only needs the data file path / partition info 
/ row positions,
+    // which are independent of delete-file support, so it MUST be set up 
before the
+    // format-version gate below. The FE adds the hidden 
__DORIS_ICEBERG_ROWID_COL__ column
+    // whenever show_hidden_columns is on, regardless of format version (see
+    // IcebergExternalTable.getFullSchema). If a v1 table selects this column 
we still have to
+    // fill it; otherwise it stays empty while the other columns are filtered 
down, tripping the
+    // `block->rows() == col.column->size()` check in 
RowGroupReader::_do_lazy_read.
     if (_need_row_id_column) {
         std::string file_path = table_desc.original_file_path;
         int32_t partition_spec_id = 0;
@@ -237,6 +240,11 @@ Status IcebergTableReader::init_row_filters() {
                   << ", partition_spec_id: " << partition_spec_id;
     }
 
+    const auto& version = table_desc.format_version;
+    if (version < MIN_SUPPORT_DELETE_FILES_VERSION) {
+        return Status::OK();
+    }
+
     std::vector<TIcebergDeleteFileDesc> position_delete_files;
     std::vector<TIcebergDeleteFileDesc> equality_delete_files;
     std::vector<TIcebergDeleteFileDesc> deletion_vector_files;
diff --git 
a/regression-test/suites/external_table_p0/iceberg/test_iceberg_v1_rowid_hidden_column.groovy
 
b/regression-test/suites/external_table_p0/iceberg/test_iceberg_v1_rowid_hidden_column.groovy
new file mode 100644
index 00000000000..bc8774b3160
--- /dev/null
+++ 
b/regression-test/suites/external_table_p0/iceberg/test_iceberg_v1_rowid_hidden_column.groovy
@@ -0,0 +1,175 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Regression for a BE crash when reading the hidden Iceberg row-id column on a
+// format-version 1 table:
+//
+//   F ... vparquet_group_reader.cpp:787 Check failed: block->rows() == 
col.column->size()
+//         block rows = 1 , column rows = 0, col name = 
__DORIS_ICEBERG_ROWID_COL__
+//
+// The FE adds the hidden __DORIS_ICEBERG_ROWID_COL__ column whenever
+// show_hidden_columns=true, regardless of the Iceberg format version
+// (IcebergExternalTable.getFullSchema). The BE, however, only enabled row-id
+// generation for format-version >= 2, because 
IcebergTableReader::init_row_filters
+// set up the row-id params *after* the delete-files version gate
+// (version < MIN_SUPPORT_DELETE_FILES_VERSION returns early). On a v1 table a
+// SELECT * therefore left the row-id column empty while every other column was
+// filtered/filled to the surviving row count, tripping the consistency check 
in
+// RowGroupReader::_do_lazy_read.
+//
+// The fix moves the row-id setup ahead of the version gate, so this test must
+// return correct results (and not crash the BE) for v1 tables.
+suite("test_iceberg_v1_rowid_hidden_column", 
"p0,external,iceberg,external_docker,external_docker_iceberg") {
+    String enabled = context.config.otherConfigs.get("enableIcebergTest")
+    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+        logger.info("Iceberg test is disabled")
+        return
+    }
+
+    String catalogName = "test_iceberg_v1_rowid_hidden_column"
+    String dbName = "test_iceberg_v1_rowid_db"
+    String restPort = context.config.otherConfigs.get("iceberg_rest_uri_port")
+    String minioPort = context.config.otherConfigs.get("iceberg_minio_port")
+    String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+    String endpoint = "http://${externalEnvIp}:${minioPort}";
+
+    def descColumns = { rows -> rows.collect { row -> 
row[0].toString().toLowerCase() } }
+
+    sql """drop catalog if exists ${catalogName}"""
+    sql """
+        create catalog if not exists ${catalogName} properties (
+            "type" = "iceberg",
+            "iceberg.catalog.type" = "rest",
+            "uri" = "http://${externalEnvIp}:${restPort}";,
+            "s3.access_key" = "admin",
+            "s3.secret_key" = "password",
+            "s3.endpoint" = "${endpoint}",
+            "s3.region" = "us-east-1"
+        )
+    """
+
+    sql """switch ${catalogName}"""
+    sql """create database if not exists ${dbName}"""
+    sql """use ${dbName}"""
+    sql """set enable_fallback_to_original_planner = false"""
+    sql """set show_hidden_columns = false"""
+
+    // Run the full set of repro queries against one v1 table (3 visible 
columns: id, s, age).
+    def runChecks = { tbl, format ->
+        sql """
+            insert into ${tbl} values
+            (1, 'a', 25),
+            (2, 'a', 30),
+            (3, 'b', 35),
+            (4, 'c', 40)
+        """
+
+        // 1. Default visibility: the row-id hidden column is not exposed.
+        sql """set show_hidden_columns = false"""
+        def visible = sql """select * from ${tbl} order by id"""
+        assertEquals(4, visible.size())
+        assertEquals(3, visible[0].size())
+
+        // 2. show_hidden_columns=true exposes only 
__DORIS_ICEBERG_ROWID_COL__ on a v1 table
+        //    (v3 row lineage columns must NOT appear).
+        sql """set show_hidden_columns = true"""
+        def hiddenCols = descColumns(sql("""desc ${tbl}"""))
+        log.info("v1 desc with show_hidden_columns=true (${format}, ${tbl}) -> 
${hiddenCols}")
+        assertTrue(hiddenCols.any { it.contains("doris_iceberg_rowid") },
+                "show_hidden_columns should expose __DORIS_ICEBERG_ROWID_COL__ 
for v1 table, got ${hiddenCols}")
+        assertTrue(!hiddenCols.contains("_row_id"),
+                "v1 table must not expose v3 row lineage column _row_id, got 
${hiddenCols}")
+        assertTrue(!hiddenCols.contains("_last_updated_sequence_number"),
+                "v1 table must not expose v3 row lineage column 
_last_updated_sequence_number, got ${hiddenCols}")
+
+        // 3. Crash repro: SELECT * with a predicate on the string column that 
filters rows.
+        //    Before the fix the BE aborted here (row-id column left at 0 
rows).
+        def byString = sql """select * from ${tbl} where s = 'a' order by id"""
+        log.info("v1 SELECT * where s='a' (${format}, ${tbl}) -> ${byString}")
+        assertEquals(2, byString.size())
+        assertEquals(4, byString[0].size()) // id, s, age, 
__DORIS_ICEBERG_ROWID_COL__
+        assertEquals(1, byString[0][0].toString().toInteger())
+        assertEquals(2, byString[1][0].toString().toInteger())
+        assertTrue(byString[0][byString[0].size() - 1] != null,
+                "row-id column must be populated, got null: ${byString[0]}")
+
+        // 4. Crash repro: predicate on a data column (exercises lazy 
materialization, the path
+        //    in the reported stack) with other lazy columns in the projection.
+        def byData = sql """select * from ${tbl} where id > 1 order by id"""
+        log.info("v1 SELECT * where id>1 (${format}, ${tbl}) -> ${byData}")
+        assertEquals(3, byData.size())
+        assertEquals(4, byData[0].size())
+        assertEquals([2, 3, 4], byData.collect { it[0].toString().toInteger() 
})
+
+        // 5. Full scan with the hidden column also returns every row.
+        def all = sql """select * from ${tbl} order by id"""
+        assertEquals(4, all.size())
+        assertEquals(4, all[0].size())
+
+        sql """set show_hidden_columns = false"""
+    }
+
+    def formats = ["parquet", "orc"]
+    try {
+        formats.each { format ->
+            String unpartitioned = "test_iceberg_v1_rowid_${format}"
+            String partitioned = "test_iceberg_v1_rowid_part_${format}"
+            log.info("Run v1 row-id hidden-column test with format ${format}")
+            try {
+                // Unpartitioned v1 table: row-id with partition_spec_id = 0 / 
empty partition data.
+                sql """drop table if exists ${unpartitioned}"""
+                sql """
+                    create table ${unpartitioned} (
+                        id int,
+                        s string,
+                        age int
+                    ) engine=iceberg
+                    properties (
+                        "format-version" = "1",
+                        "write.format.default" = "${format}"
+                    )
+                """
+                runChecks(unpartitioned, format)
+
+                // Partitioned v1 table mirroring the original crash (a 
Spark-written partitioned
+                // v1 table), so the row-id struct's partition_spec_id / 
partition_data are exercised.
+                sql """drop table if exists ${partitioned}"""
+                sql """
+                    create table ${partitioned} (
+                        id int,
+                        s string,
+                        age int
+                    ) engine=iceberg
+                    partition by list (s) ()
+                    properties (
+                        "format-version" = "1",
+                        "write.format.default" = "${format}"
+                    )
+                """
+                runChecks(partitioned, format)
+            } finally {
+                sql """set show_hidden_columns = false"""
+                sql """drop table if exists ${partitioned}"""
+                sql """drop table if exists ${unpartitioned}"""
+            }
+        }
+    } finally {
+        sql """set show_hidden_columns = false"""
+        sql """drop database if exists ${dbName} force"""
+        sql """drop catalog if exists ${catalogName}"""
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris) 02/02: [fix](iceberg) Fill row-id hidden column for format-version 1 tables

Reply via email to