This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 400cc6b82e7 [improvement](iceberg) remove extra spark-shell bootstrap
(#61649)
400cc6b82e7 is described below
commit 400cc6b82e781f2a53177bc060502d9e67e6906d
Author: Chenjunwei <[email protected]>
AuthorDate: Wed Mar 25 13:17:31 2026 +0800
[improvement](iceberg) remove extra spark-shell bootstrap (#61649)
## What
- Remove the extra spark-shell bootstrap from the spark-iceberg
container startup
- Move deletion-vector seed data into a new aggregated spark-sql script
- Delete the unused scala bootstrap script
## Why
- External regression pays this spark-iceberg startup cost on every
environment bootstrap
- Keeping the deletion-vector seed data inside the existing spark-sql
batch avoids one extra heavyweight Spark session
## Testing
- `bash -n docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl`
- Did not run docker-based regression tests in this environment
---
.../docker-compose/iceberg/entrypoint.sh.tpl | 10 --
.../create_preinstalled_scripts/iceberg/run28.sql | 98 ++++++++++++
.../iceberg_scala/run01.scala | 167 ---------------------
3 files changed, 98 insertions(+), 177 deletions(-)
diff --git a/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl
b/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl
index 85863e04b75..a722514bbb4 100644
--- a/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl
+++ b/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl
@@ -55,16 +55,6 @@ EXECUTION_TIME2=$((END_TIME2 - START_TIME2))
echo "Script paimon total: {} executed in $EXECUTION_TIME2 seconds"
-
-ls /mnt/scripts/create_preinstalled_scripts/iceberg_scala/*.scala | xargs -n 1
-I {} bash -c '
- START_TIME=$(date +%s)
- spark-shell --conf
spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
-I {}
- END_TIME=$(date +%s)
- EXECUTION_TIME=$((END_TIME - START_TIME))
- echo "Script: {} executed in $EXECUTION_TIME seconds"
-'
-
-
START_TIME3=$(date +%s)
find /mnt/scripts/create_preinstalled_scripts/iceberg_load -name '*.sql' | sed
's|^|source |' | sed 's|$|;|'> iceberg_load_total.sql
spark-sql --master spark://doris--spark-iceberg:7077 --conf
spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
-f iceberg_load_total.sql
diff --git
a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run28.sql
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run28.sql
new file mode 100644
index 00000000000..adbfafcf976
--- /dev/null
+++
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run28.sql
@@ -0,0 +1,98 @@
+create database if not exists format_v3;
+use format_v3;
+
+-- Keep deletion-vector test data in the aggregated spark-sql bootstrap flow
+-- so the iceberg container does not need an additional spark-shell session.
+drop table if exists dv_test;
+create table dv_test (
+ id int,
+ batch int,
+ data string
+)
+using iceberg
+tblproperties (
+ 'format-version' = '3',
+ 'write.delete.mode' = 'merge-on-read',
+ 'write.update.mode' = 'merge-on-read',
+ 'write.merge.mode' = 'merge-on-read'
+);
+
+insert into dv_test values
+ (1, 1, 'a'), (2, 1, 'b'), (3, 1, 'c'), (4, 1, 'd'),
+ (5, 1, 'e'), (6, 1, 'f'), (7, 1, 'g'), (8, 1, 'h');
+
+delete from dv_test
+where batch = 1 and id in (3, 4, 5);
+
+insert into dv_test values
+ (9, 2, 'i'), (10, 2, 'j'), (11, 2, 'k'), (12, 2, 'l'),
+ (13, 2, 'm'), (14, 2, 'n'), (15, 2, 'o'), (16, 2, 'p');
+
+delete from dv_test
+where batch = 2 and id >= 14;
+
+delete from dv_test
+where id % 2 = 1;
+
+drop table if exists dv_test_v2;
+create table dv_test_v2 (
+ id int,
+ batch int,
+ data string
+)
+using iceberg
+tblproperties (
+ 'format-version' = '2',
+ 'write.delete.mode' = 'merge-on-read',
+ 'write.update.mode' = 'merge-on-read',
+ 'write.merge.mode' = 'merge-on-read'
+);
+
+insert into dv_test_v2 values
+ (1, 1, 'a'), (2, 1, 'b'), (3, 1, 'c'), (4, 1, 'd'),
+ (5, 1, 'e'), (6, 1, 'f'), (7, 1, 'g'), (8, 1, 'h');
+
+delete from dv_test_v2
+where batch = 1 and id in (3, 4, 5);
+
+alter table dv_test_v2
+set tblproperties ('format-version' = '3');
+
+delete from dv_test_v2
+where id % 2 = 1;
+
+drop table if exists dv_test_1w;
+create table dv_test_1w (
+ id bigint,
+ grp int,
+ value int,
+ ts timestamp
+)
+using iceberg
+tblproperties (
+ 'format-version' = '3',
+ 'write.delete.mode' = 'merge-on-read',
+ 'write.update.mode' = 'merge-on-read',
+ 'write.merge.mode' = 'merge-on-read',
+ 'write.parquet.row-group-size-bytes' = '10240'
+);
+
+insert into dv_test_1w
+select /*+ REPARTITION(10) */
+ id,
+ cast(id % 100 as int) as grp,
+ cast(rand(20260324) * 1000 as int) as value,
+ timestamp '2025-01-01 00:00:00' as ts
+from range(0, 100000);
+
+set spark.sql.shuffle.partitions = 1;
+set spark.sql.adaptive.enabled = false;
+
+delete from dv_test_1w
+where id % 2 = 1;
+
+delete from dv_test_1w
+where id % 3 = 1;
+
+reset spark.sql.shuffle.partitions;
+reset spark.sql.adaptive.enabled;
diff --git
a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg_scala/run01.scala
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg_scala/run01.scala
deleted file mode 100644
index f557ac9e2a7..00000000000
---
a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg_scala/run01.scala
+++ /dev/null
@@ -1,167 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-val dbName = "format_v3"
-val tableName = "dv_test"
-val fullTable = s"$dbName.$tableName"
-
-spark.sql(s"CREATE DATABASE IF NOT EXISTS $dbName")
-
-
-spark.sql(s""" drop table if exists $fullTable """)
-spark.sql(s"""
-CREATE TABLE $fullTable (
- id INT,
- batch INT,
- data STRING
-)
-USING iceberg
-TBLPROPERTIES (
- 'format-version' = '3',
- 'write.delete.mode' = 'merge-on-read',
- 'write.update.mode' = 'merge-on-read',
- 'write.merge.mode' = 'merge-on-read'
-)
-""")
-
-import spark.implicits._
-
-val batch1 = Seq(
- (1, 1, "a"), (2, 1, "b"), (3, 1, "c"), (4, 1, "d"),
- (5, 1, "e"), (6, 1, "f"), (7, 1, "g"), (8, 1, "h")
-).toDF("id", "batch", "data")
- .coalesce(1)
-
-batch1.writeTo(fullTable).append()
-
-spark.sql(s"""
-DELETE FROM $fullTable
-WHERE batch = 1 AND id IN (3, 4, 5)
-""")
-
-val batch2 = Seq(
- (9, 2, "i"), (10, 2, "j"), (11, 2, "k"), (12, 2, "l"),
- (13, 2, "m"), (14, 2, "n"), (15, 2, "o"), (16, 2, "p")
-).toDF("id", "batch", "data")
- .coalesce(1)
-
-batch2.writeTo(fullTable).append()
-
-spark.sql(s"""
-DELETE FROM $fullTable
-WHERE batch = 2 AND id >= 14
-""")
-
-spark.sql(s"""
-DELETE FROM $fullTable
-WHERE id % 2 = 1
-""")
-
-
-// spark.sql(s""" select count(*) from $fullTable """).show()
-
-
-// v2 to v3.
-
-val tableName = "dv_test_v2"
-val fullTable = s"$dbName.$tableName"
-spark.sql(s""" drop table if exists $fullTable """)
-
-spark.sql(s"""
-CREATE TABLE $fullTable (
- id INT,
- batch INT,
- data STRING
-)
-USING iceberg
-TBLPROPERTIES (
- 'format-version' = '2',
- 'write.delete.mode' = 'merge-on-read',
- 'write.update.mode' = 'merge-on-read',
- 'write.merge.mode' = 'merge-on-read'
-)
-""")
-
-batch1.writeTo(fullTable).append()
-
-spark.sql(s"""
-DELETE FROM $fullTable
-WHERE batch = 1 AND id IN (3, 4, 5)
-""")
-
-spark.sql(s"""
-ALTER TABLE $fullTable
-SET TBLPROPERTIES ('format-version' = '3')
-""")
-
-spark.sql(s"""
-DELETE FROM $fullTable
-WHERE id % 2 = 1
-""")
-
-
-// spark.sql(s""" select * from $fullTable order by id """).show()
-
-
-val tableName = "dv_test_1w"
-val fullTable = s"$dbName.$tableName"
-spark.sql(s""" drop table if exists $fullTable """)
-
-spark.sql(s"""
-CREATE TABLE $fullTable (
- id BIGINT,
- grp INT,
- value INT,
- ts TIMESTAMP
-)
-USING iceberg
-TBLPROPERTIES (
- 'format-version'='3',
- 'write.delete.mode'='merge-on-read',
- 'write.update.mode'='merge-on-read',
- 'write.merge.mode'='merge-on-read',
- 'write.parquet.row-group-size-bytes'='10240'
-)
-""")
-
-import org.apache.spark.sql.functions._
-
-val df = spark.range(0, 100000).select(
- col("id"),
- (col("id") % 100).cast("int").as("grp"),
- (rand() * 1000).cast("int").as("value"),
- current_timestamp().as("ts")
- )
-
-df.repartition(10).writeTo(fullTable).append()
-
-
-spark.conf.set("spark.sql.shuffle.partitions", "1")
-spark.conf.set("spark.sql.adaptive.enabled", "false")
-
-
-spark.sql(s"""
-DELETE FROM $fullTable
-WHERE id%2 = 1
-""")
-
-spark.sql(s"""
-DELETE FROM $fullTable
-WHERE id%3 = 1
-""")
-
-// spark.sql(s""" select count(*) from $fullTable """).show()
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]