This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 400cc6b82e7 [improvement](iceberg) remove extra spark-shell bootstrap
(#61649)
400cc6b82e7 is described below
commit 400cc6b82e781f2a53177bc060502d9e67e6906d
Author: Chenjunwei <[email protected]>
AuthorDate: Wed Mar 25 13:17:31 2026 +0800
[improvement](iceberg) remove extra spark-shell bootstrap (#61649)
## What
- Remove the extra spark-shell bootstrap from the spark-iceberg
container startup
- Move deletion-vector seed data into a new aggregated spark-sql script
- Delete the unused scala bootstrap script
## Why
- External regression pays this spark-iceberg startup cost on every
environment bootstrap
- Keeping the deletion-vector seed data inside the existing spark-sql
batch avoids one extra heavyweight Spark session
## Testing
- `bash -n docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl`
- Did not run docker-based regression tests in this environment
---
.../docker-compose/iceberg/entrypoint.sh.tpl | 10 --
.../create_preinstalled_scripts/iceberg/run28.sql | 98 ++++++++++++
.../iceberg_scala/run01.scala | 167 ---------------------
3 files changed, 98 insertions(+), 177 deletions(-)
diff --git a/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl
b/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl
index 85863e04b75..a722514bbb4 100644
--- a/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl
+++ b/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl
@@ -55,16 +55,6 @@ EXECUTION_TIME2=$((END_TIME2 - START_TIME2))
echo "Script paimon total: {} executed in $EXECUTION_TIME2 seconds"
-
-ls /mnt/scripts/create_preinstalled_scripts/iceberg_scala/*.scala | xargs -n 1
-I {} bash -c '
- START_TIME=$(date +%s)
- spark-shell --conf
spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
-I {}
- END_TIME=$(date +%s)
- EXECUTION_TIME=$((END_TIME - START_TIME))
- echo "Script: {} executed in $EXECUTION_TIME seconds"
-'
-
-
START_TIME3=$(date +%s)
find /mnt/scripts/create_preinstalled_scripts/iceberg_load -name '*.sql' | sed
's|^|source |' | sed 's|$|;|'> iceberg_load_total.sql
spark-sql --master spark://doris--spark-iceberg:7077 --conf
spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
-f iceberg_load_total.sql
diff --git
a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run28.sql
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run28.sql
new file mode 100644
index 00000000000..adbfafcf976
--- /dev/null
+++
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run28.sql
@@ -0,0 +1,98 @@
+create database if not exists format_v3;
+use format_v3;
+
+-- Keep deletion-vector test data in the aggregated spark-sql bootstrap flow
+-- so the iceberg container does not need an additional spark-shell session.
+drop table if exists dv_test;
+create table dv_test (
+ id int,
+ batch int,
+ data string
+)
+using iceberg
+tblproperties (
+ 'format-version' = '3',
+ 'write.delete.mode' = 'merge-on-read',
+ 'write.update.mode' = 'merge-on-read',
+ 'write.merge.mode' = 'merge-on-read'
+);
+
+insert into dv_test values
+ (1, 1, 'a'), (2, 1, 'b'), (3, 1, 'c'), (4, 1, 'd'),
+ (5, 1, 'e'), (6, 1, 'f'), (7, 1, 'g'), (8, 1, 'h');
+
+delete from dv_test
+where batch = 1 and id in (3, 4, 5);
+
+insert into dv_test values
+ (9, 2, 'i'), (10, 2, 'j'), (11, 2, 'k'), (12, 2, 'l'),
+ (13, 2, 'm'), (14, 2, 'n'), (15, 2, 'o'), (16, 2, 'p');
+
+delete from dv_test
+where batch = 2 and id >= 14;
+
+delete from dv_test
+where id % 2 = 1;
+
+drop table if exists dv_test_v2;
+create table dv_test_v2 (
+ id int,
+ batch int,
+ data string
+)
+using iceberg
+tblproperties (
+ 'format-version' = '2',
+ 'write.delete.mode' = 'merge-on-read',
+ 'write.update.mode' = 'merge-on-read',
+ 'write.merge.mode' = 'merge-on-read'
+);
+
+insert into dv_test_v2 values
+ (1, 1, 'a'), (2, 1, 'b'), (3, 1, 'c'), (4, 1, 'd'),
+ (5, 1, 'e'), (6, 1, 'f'), (7, 1, 'g'), (8, 1, 'h');
+
+delete from dv_test_v2
+where batch = 1 and id in (3, 4, 5);
+
+alter table dv_test_v2
+set tblproperties ('format-version' = '3');
+
+delete from dv_test_v2
+where id % 2 = 1;
+
+drop table if exists dv_test_1w;
+create table dv_test_1w (
+ id bigint,
+ grp int,
+ value int,
+ ts timestamp
+)
+using iceberg
+tblproperties (
+ 'format-version' = '3',
+ 'write.delete.mode' = 'merge-on-read',
+ 'write.update.mode' = 'merge-on-read',
+ 'write.merge.mode' = 'merge-on-read',
+ 'write.parquet.row-group-size-bytes' = '10240'
+);
+
+insert into dv_test_1w
+select /*+ REPARTITION(10) */
+ id,
+ cast(id % 100 as int) as grp,
+ cast(rand(20260324) * 1000 as int) as value,
+ timestamp '2025-01-01 00:00:00' as ts
+from range(0, 100000);
+
+set spark.sql.shuffle.partitions = 1;
+set spark.sql.adaptive.enabled = false;
+
+delete from dv_test_1w
+where id % 2 = 1;
+
+delete from dv_test_1w
+where id % 3 = 1;
+
+reset spark.sql.shuffle.partitions;
+reset spark.sql.adaptive.enabled;
diff --git
a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg_scala/run01.scala
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg_scala/run01.scala
deleted file mode 100644
index f557ac9e2a7..00000000000
---
a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg_scala/run01.scala
+++ /dev/null
@@ -1,167 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-val dbName = "format_v3"
-val tableName = "dv_test"
-val fullTable = s"$dbName.$tableName"
-
-spark.sql(s"CREATE DATABASE IF NOT EXISTS $dbName")
-
-
-spark.sql(s""" drop table if exists $fullTable """)
-spark.sql(s"""
-CREATE TABLE $fullTable (
- id INT,
- batch INT,
- data STRING
-)
-USING iceberg
-TBLPROPERTIES (
- 'format-version' = '3',
- 'write.delete.mode' = 'merge-on-read',
- 'write.update.mode' = 'merge-on-read',
- 'write.merge.mode' = 'merge-on-read'
-)
-""")
-
-import spark.implicits._
-
-val batch1 = Seq(
- (1, 1, "a"), (2, 1, "b"), (3, 1, "c"), (4, 1, "d"),
- (5, 1, "e"), (6, 1, "f"), (7, 1, "g"), (8, 1, "h")
-).toDF("id", "batch", "data")
- .coalesce(1)
-
-batch1.writeTo(fullTable).append()
-
-spark.sql(s"""
-DELETE FROM $fullTable
-WHERE batch = 1 AND id IN (3, 4, 5)
-""")
-
-val batch2 = Seq(
- (9, 2, "i"), (10, 2, "j"), (11, 2, "k"), (12, 2, "l"),
- (13, 2, "m"), (14, 2, "n"), (15, 2, "o"), (16, 2, "p")
-).toDF("id", "batch", "data")
- .coalesce(1)
-
-batch2.writeTo(fullTable).append()
-
-spark.sql(s"""
-DELETE FROM $fullTable
-WHERE batch = 2 AND id >= 14
-""")
-
-spark.sql(s"""
-DELETE FROM $fullTable
-WHERE id % 2 = 1
-""")
-
-
-// spark.sql(s""" select count(*) from $fullTable """).show()
-
-
-// v2 to v3.
-
-val tableName = "dv_test_v2"
-val fullTable = s"$dbName.$tableName"
-spark.sql(s""" drop table if exists $fullTable """)
-
-spark.sql(s"""
-CREATE TABLE $fullTable (
- id INT,
- batch INT,
- data STRING
-)
-USING iceberg
-TBLPROPERTIES (
- 'format-version' = '2',
- 'write.delete.mode' = 'merge-on-read',
- 'write.update.mode' = 'merge-on-read',
- 'write.merge.mode' = 'merge-on-read'
-)
-""")
-
-batch1.writeTo(fullTable).append()
-
-spark.sql(s"""
-DELETE FROM $fullTable
-WHERE batch = 1 AND id IN (3, 4, 5)
-""")
-
-spark.sql(s"""
-ALTER TABLE $fullTable
-SET TBLPROPERTIES ('format-version' = '3')
-""")
-
-spark.sql(s"""
-DELETE FROM $fullTable
-WHERE id % 2 = 1
-""")
-
-
-// spark.sql(s""" select * from $fullTable order by id """).show()
-
-
-val tableName = "dv_test_1w"
-val fullTable = s"$dbName.$tableName"
-spark.sql(s""" drop table if exists $fullTable """)
-
-spark.sql(s"""
-CREATE TABLE $fullTable (
- id BIGINT,
- grp INT,
- value INT,
- ts TIMESTAMP
-)
-USING iceberg
-TBLPROPERTIES (
- 'format-version'='3',
- 'write.delete.mode'='merge-on-read',
- 'write.update.mode'='merge-on-read',
- 'write.merge.mode'='merge-on-read',
- 'write.parquet.row-group-size-bytes'='10240'
-)
-""")
-
-import org.apache.spark.sql.functions._
-
-val df = spark.range(0, 100000).select(
- col("id"),
- (col("id") % 100).cast("int").as("grp"),
- (rand() * 1000).cast("int").as("value"),
- current_timestamp().as("ts")
- )
-
-df.repartition(10).writeTo(fullTable).append()
-
-
-spark.conf.set("spark.sql.shuffle.partitions", "1")
-spark.conf.set("spark.sql.adaptive.enabled", "false")
-
-
-spark.sql(s"""
-DELETE FROM $fullTable
-WHERE id%2 = 1
-""")
-
-spark.sql(s"""
-DELETE FROM $fullTable
-WHERE id%3 = 1
-""")
-
-// spark.sql(s""" select count(*) from $fullTable """).show()
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]