This is an automated email from the ASF dual-hosted git repository.
fanng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/gravitino-playground.git
The following commit(s) were added to refs/heads/main by this push:
new d64b86a Optimize the Spark Jupyter (#88)
d64b86a is described below
commit d64b86a51fa919dbe25e906958833da9ef231455
Author: roryqi <[email protected]>
AuthorDate: Mon Oct 28 15:07:54 2024 +0800
Optimize the Spark Jupyter (#88)
I forgot to upgrade the version of the Spark Jupyter jar to 0.6.1.
I also optimized the usage of the Spark jars.
---
docker-compose.yaml | 2 --
init/common/init_metalake_catalog.sh | 2 +-
init/jupyter/gravitino-spark-trino-example.ipynb | 2 +-
.../jupyter-dependency.sh} | 25 +++++++++++++---------
init/spark/spark-defaults.conf | 2 ++
playground.sh | 1 +
6 files changed, 20 insertions(+), 14 deletions(-)
diff --git a/docker-compose.yaml b/docker-compose.yaml
index a4f2964..1decede 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -136,7 +136,6 @@ services:
volumes:
- ./init/spark:/tmp/spark
- ./init/common:/tmp/common
- - spark_jars:/opt/spark/jars
jupyter:
image: jupyter/pyspark-notebook:spark-3.4.1
@@ -145,7 +144,6 @@ services:
- 18888:8888
volumes:
- ./init/jupyter:/tmp/gravitino
- - spark_jars:/opt/spark/jars
entrypoint: /bin/bash /tmp/gravitino/init.sh
depends_on:
hive :
diff --git a/init/common/init_metalake_catalog.sh
b/init/common/init_metalake_catalog.sh
index 9b2bed4..5da3246 100644
--- a/init/common/init_metalake_catalog.sh
+++ b/init/common/init_metalake_catalog.sh
@@ -35,7 +35,7 @@ if echo "$response" | grep -q "\"code\":0"; then
true
else
# Create Hive catalog for experience Gravitino service
- response=$(curl -X POST -H "Content-Type: application/json" -d
'{"name":"catalog_hive","type":"RELATIONAL", "provider":"hive",
"comment":"comment","properties":{"metastore.uris":"thrift://hive:9083",
"spark.bypass.spark.sql.hive.metastore.jars":"path",
"spark.bypass.spark.sql.hive.metastore.jars.path":"file:///opt/spark/jars/*"
}}' http://gravitino:8090/api/metalakes/metalake_demo/catalogs)
+ response=$(curl -X POST -H "Content-Type: application/json" -d
'{"name":"catalog_hive","type":"RELATIONAL", "provider":"hive",
"comment":"comment","properties":{"metastore.uris":"thrift://hive:9083"}}'
http://gravitino:8090/api/metalakes/metalake_demo/catalogs)
if echo "$response" | grep -q "\"code\":0"; then
true # Placeholder, do nothing
else
diff --git a/init/jupyter/gravitino-spark-trino-example.ipynb
b/init/jupyter/gravitino-spark-trino-example.ipynb
index eaf14fe..48ea713 100644
--- a/init/jupyter/gravitino-spark-trino-example.ipynb
+++ b/init/jupyter/gravitino-spark-trino-example.ipynb
@@ -21,7 +21,7 @@
"spark = SparkSession.builder \\\n",
" .appName(\"PySpark SQL Example\") \\\n",
" .config(\"spark.plugins\",
\"org.apache.gravitino.spark.connector.plugin.GravitinoSparkPlugin\") \\\n",
- " .config(\"spark.jars\",
\"/opt/spark/jars/iceberg-spark-runtime-3.4_2.12-1.5.2.jar,/opt/spark/jars/gravitino-spark-connector-runtime-3.4_2.12-0.6.1-incubating.jar\")
\\\n",
+ " .config(\"spark.jars\",
\"/tmp/gravitino/packages/iceberg-spark-runtime-3.4_2.12-1.5.2.jar,/tmp/gravitino/packages/gravitino-spark-connector-runtime-3.4_2.12-0.6.1-incubating.jar\")
\\\n",
" .config(\"spark.sql.gravitino.uri\", \"http://gravitino:8090\") \\\n",
" .config(\"spark.sql.gravitino.metalake\", \"metalake_demo\") \\\n",
" .config(\"spark.sql.gravitino.enableIcebergSupport\", \"true\") \\\n",
diff --git a/init/spark/spark-defaults.conf b/init/jupyter/jupyter-dependency.sh
old mode 100644
new mode 100755
similarity index 58%
copy from init/spark/spark-defaults.conf
copy to init/jupyter/jupyter-dependency.sh
index c6c72b4..3ddb748
--- a/init/spark/spark-defaults.conf
+++ b/init/jupyter/jupyter-dependency.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
@@ -17,13 +19,16 @@
# under the License.
#
-spark.plugins org.apache.gravitino.spark.connector.plugin.GravitinoSparkPlugin
-spark.sql.gravitino.uri http://gravitino:8090
-spark.sql.gravitino.metalake metalake_demo
-spark.sql.gravitino.enableIcebergSupport true
-spark.sql.extensions
org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
-spark.sql.catalog.catalog_rest org.apache.iceberg.spark.SparkCatalog
-spark.sql.catalog.catalog_rest.type rest
-spark.sql.catalog.catalog_rest.uri http://gravitino:9001/iceberg/
-spark.locality.wait.node 0
-spark.sql.warehouse.dir hdfs://hive:9000/user/hive/warehouse
+jupyter_dir="$(dirname "${BASH_SOURCE-$0}")"
+jupyter_dir="$(
+ cd "${jupyter_dir}" >/dev/null
+ pwd
+)"
+. "${jupyter_dir}/../common/common.sh"
+
+# Prepare download packages
+if [[ ! -d "${jupyter_dir}/packages" ]]; then
+ mkdir "${jupyter_dir}/packages"
+ find "${jupyter_dir}/../spark/packages/" | grep jar | xargs -I {} ln {}
"${jupyter_dir}/packages/"
+fi
+
diff --git a/init/spark/spark-defaults.conf b/init/spark/spark-defaults.conf
index c6c72b4..446f865 100644
--- a/init/spark/spark-defaults.conf
+++ b/init/spark/spark-defaults.conf
@@ -27,3 +27,5 @@ spark.sql.catalog.catalog_rest.type rest
spark.sql.catalog.catalog_rest.uri http://gravitino:9001/iceberg/
spark.locality.wait.node 0
spark.sql.warehouse.dir hdfs://hive:9000/user/hive/warehouse
+spark.sql.hive.metastore.jars path
+spark.sql.hive.metastore.jars.path file:///opt/spark/jars/*
diff --git a/playground.sh b/playground.sh
index 2144453..abd4777 100755
--- a/playground.sh
+++ b/playground.sh
@@ -73,6 +73,7 @@ start() {
echo "Preparing packages..."
./init/spark/spark-dependency.sh
./init/gravitino/gravitino-dependency.sh
+ ./init/jupyter/jupyter-dependency.sh
logSuffix=$(date +%Y%m%d%H%m%s)
docker-compose up --detach