This is an automated email from the ASF dual-hosted git repository.

yuqi4733 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/gravitino-playground.git


The following commit(s) were added to refs/heads/main by this push:
     new 2c184c7  [#34] add Gravitino Spark connector to playground (#45)
2c184c7 is described below

commit 2c184c71263041350548410d9806bdd813e10b0f
Author: FANNG <[email protected]>
AuthorDate: Thu Aug 29 14:44:21 2024 +0800

    [#34] add Gravitino Spark connector to playground (#45)
    
    add spark connector to playground
    
    fixes: #34
---
 README.md                            |  60 ++++++++++++++++++---
 docker-compose.yaml                  |   4 +-
 init/common/init_metalake_catalog.sh |  90 +++++++++++++++++++++++++++++++
 init/gravitino/gravitino.conf        |   1 +
 init/hive/init.sh                    |   1 +
 init/spark/init.sh                   |   5 +-
 init/spark/spark-defaults.conf       |  11 ++--
 init/trino/init.sh                   | 101 +----------------------------------
 8 files changed, 162 insertions(+), 111 deletions(-)

diff --git a/README.md b/README.md
index 722abab..033c7a6 100644
--- a/README.md
+++ b/README.md
@@ -75,7 +75,7 @@ docker exec -it playground-trino bash
 trino@container_id:/$ trino
 ```
 
-### Using Jupiter Notebook
+## Using Jupyter Notebook
 
 1. Open the Jupyter Notebook in the browser at 
[http://localhost:18888](http://localhost:18888).
 
@@ -83,9 +83,23 @@ trino@container_id:/$ trino
 
 3. Start the notebook and run the cells.
 
+## Using Spark client
+
+1. Log in to the Gravitino playground Spark Docker container using the 
following command:
+
+```shell
+docker exec -it playground-spark bash
+```
+
+2. Open the Spark SQL client in the container.
+
+```shell
+spark@container_id:/$ cd /opt/spark && /bin/bash bin/spark-sql 
+```
+
 ## Example
 
-### Simple queries
+### Simple Trino queries
 
 You can use simple queries to test in the Trino CLI.
 
@@ -156,6 +170,38 @@ WHERE e.employee_id = p.employee_id AND p.employee_id = 
s.employee_id
 GROUP BY e.employee_id,  given_name, family_name;
 ```
 
+### Using Spark and Trino
+
+You might consider generating data with SparkSQL and then querying this data 
using Trino. Give it a try with Gravitino:
+
+1. Log in to the Spark container and execute the following SQL statements:
+
+```sql
+-- Use the Hive catalog to create a Hive table
+USE catalog_hive;
+CREATE DATABASE product;
+USE product;
+
+CREATE TABLE IF NOT EXISTS employees (
+    id INT,
+    name STRING,
+    age INT
+)
+PARTITIONED BY (department STRING)
+STORED AS PARQUET;
+DESC TABLE EXTENDED employees;
+
+INSERT OVERWRITE TABLE employees PARTITION(department='Engineering') VALUES 
(1, 'John Doe', 30), (2, 'Jane Smith', 28);
+INSERT OVERWRITE TABLE employees PARTITION(department='Marketing') VALUES (3, 
'Mike Brown', 32);
+```
+
+2. Log in to the Trino container and execute the following SQL:
+
+```sql
+SELECT * FROM catalog_hive.product.employees WHERE department = 'Engineering';
+```
+
+
 ### Using Apache Iceberg REST service
 
 If you want to migrate your business from Hive to Iceberg. Some tables will 
use Hive, and the other tables will use Iceberg.
@@ -166,12 +212,14 @@ Then, you can use Trino to read the data from the Hive 
table joining the Iceberg
 
 ```text
 spark.sql.extensions 
org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
-spark.sql.catalog.catalog_iceberg org.apache.iceberg.spark.SparkCatalog
-spark.sql.catalog.catalog_iceberg.type rest
-spark.sql.catalog.catalog_iceberg.uri http://gravitino:9001/iceberg/
+spark.sql.catalog.catalog_rest org.apache.iceberg.spark.SparkCatalog
+spark.sql.catalog.catalog_rest.type rest
+spark.sql.catalog.catalog_rest.uri http://gravitino:9001/iceberg/
 spark.locality.wait.node 0
 ```
 
+Please note that `catalog_rest` in SparkSQL and `catalog_iceberg` in Gravitino 
and Trino share the same Iceberg JDBC backend, which implies that they can 
access the same dataset.
+
 1. Login Spark container and execute the steps.
 
 ```shell
@@ -183,7 +231,7 @@ spark@container_id:/$ cd /opt/spark && /bin/bash 
bin/spark-sql
 ```
 
 ```SQL
-use catalog_iceberg;
+use catalog_rest;
 create database sales;
 use sales;
 create table customers (customer_id int, customer_name varchar(100), 
customer_email varchar(100));
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 8ac75e5..e9974f2 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -82,6 +82,7 @@ services:
     entrypoint:  /bin/bash /tmp/trino/init.sh
     volumes:
       - ./init/trino:/tmp/trino
+      - ./init/common:/tmp/common
     depends_on:
       hive :
         condition: service_healthy
@@ -150,6 +151,7 @@ services:
       - "14040:4040"
     volumes:
       - ./init/spark:/tmp/spark
+      - ./init/common:/tmp/common
     deploy:
       resources:
         limits:
@@ -173,4 +175,4 @@ services:
       resources:
         limits:
           cpus: "0.5"
-          memory: 500M
\ No newline at end of file
+          memory: 500M
diff --git a/init/common/init_metalake_catalog.sh 
b/init/common/init_metalake_catalog.sh
new file mode 100644
index 0000000..8a4a7be
--- /dev/null
+++ b/init/common/init_metalake_catalog.sh
@@ -0,0 +1,90 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+response=$(curl http://gravitino:8090/api/metalakes/metalake_demo)
+if echo "$response" | grep -q "\"code\":0"; then
+  true
+else
+  response=$(curl -X POST -H "Content-Type: application/json" -d 
'{"name":"metalake_demo","comment":"comment","properties":{}}' 
http://gravitino:8090/api/metalakes)
+  if echo "$response" | grep -q "\"code\":0"; then
+    true # Placeholder, do nothing
+  else
+    echo "Metalake metalake_demo create failed"
+    exit 1
+  fi
+fi
+
+
+response=$(curl 
http://gravitino:8090/api/metalakes/metalake_demo/catalogs/catalog_hive)
+if echo "$response" | grep -q "\"code\":0"; then
+  true
+else
+  # Create Hive catalog for experience Gravitino service
+  response=$(curl -X POST -H "Content-Type: application/json" -d 
'{"name":"catalog_hive","type":"RELATIONAL", "provider":"hive", 
"comment":"comment","properties":{"metastore.uris":"thrift://hive:9083", 
"spark.bypass.spark.sql.hive.metastore.jars":"path", 
"spark.bypass.spark.sql.hive.metastore.jars.path":"file:///opt/spark/jars/*" 
}}' http://gravitino:8090/api/metalakes/metalake_demo/catalogs)
+  if echo "$response" | grep -q "\"code\":0"; then
+    true # Placeholder, do nothing
+  else
+    echo "catalog_hive create failed"
+    exit 1
+  fi
+fi
+
+response=$(curl 
http://gravitino:8090/api/metalakes/metalake_demo/catalogs/catalog_postgres)
+if echo "$response" | grep -q "\"code\":0"; then
+  true
+else
+  # Create Postgresql catalog for experience Gravitino service
+  response=$(curl -X POST -H "Accept: application/vnd.gravitino.v1+json" -H 
"Content-Type: application/json" -d '{ "name":"catalog_postgres", 
"type":"RELATIONAL", "provider":"jdbc-postgresql", "comment":"comment", 
"properties":{ "jdbc-url":"jdbc:postgresql://postgresql/db", 
"jdbc-user":"postgres", "jdbc-password":"postgres", "jdbc-database":"db", 
"jdbc-driver": "org.postgresql.Driver" } }' 
http://gravitino:8090/api/metalakes/metalake_demo/catalogs)
+  if echo "$response" | grep -q "\"code\":0"; then
+    true # Placeholder, do nothing
+  else
+    echo "catalog_postgres create failed"
+    exit 1
+  fi
+fi
+
+response=$(curl 
http://gravitino:8090/api/metalakes/metalake_demo/catalogs/catalog_mysql)
+if echo "$response" | grep -q "\"code\":0"; then
+  true
+else
+  # Create Mysql catalog for experience Gravitino service
+  response=$(curl -X POST -H "Accept: application/vnd.gravitino.v1+json" -H 
"Content-Type: application/json" -d '{ "name":"catalog_mysql", 
"type":"RELATIONAL", "provider":"jdbc-mysql", "comment":"comment", 
"properties":{ "jdbc-url":"jdbc:mysql://mysql:3306", "jdbc-user":"mysql", 
"jdbc-password":"mysql", "jdbc-driver": "com.mysql.cj.jdbc.Driver" } }' 
http://gravitino:8090/api/metalakes/metalake_demo/catalogs)
+  
+  if echo "$response" | grep -q "catalog_mysql"; then
+    true # Placeholder, do nothing
+  else
+    echo "Catalog catalog_mysql create failed"
+    exit 1
+  fi
+fi
+
+response=$(curl 
http://gravitino:8090/api/metalakes/metalake_demo/catalogs/catalog_iceberg)
+if echo "$response" | grep -q "\"code\":0"; then
+  true
+else
+  # Create Iceberg catalog for experience Gravitino service
+  response=$(curl -X POST -H "Accept: application/vnd.gravitino.v1+json" -H 
"Content-Type: application/json" -d '{ "name":"catalog_iceberg", 
"type":"RELATIONAL", "provider":"lakehouse-iceberg", "comment":"comment", 
"properties":{ "uri":"jdbc:mysql://mysql:3306/db", "catalog-backend":"jdbc", 
"warehouse":"hdfs://hive:9000/user/iceberg/warehouse/", "jdbc-user":"mysql", 
"jdbc-password":"mysql", "jdbc-driver":"com.mysql.cj.jdbc.Driver"} }' 
http://gravitino:8090/api/metalakes/metalake_demo/catalogs)
+  if echo "$response" | grep -q "\"code\":0"; then
+    true # Placeholder, do nothing
+  else
+    echo "create catalog_iceberg failed"
+    exit 1
+  fi
+fi
+
diff --git a/init/gravitino/gravitino.conf b/init/gravitino/gravitino.conf
index 1adc75f..b30c9ad 100755
--- a/init/gravitino/gravitino.conf
+++ b/init/gravitino/gravitino.conf
@@ -75,3 +75,4 @@ gravitino.auxService.iceberg-rest.warehouse = 
hdfs://hive:9000/user/iceberg/ware
 gravitino.auxService.iceberg-rest.jdbc.user = mysql
 gravitino.auxService.iceberg-rest.jdbc.password = mysql
 gravitino.auxService.iceberg-rest.jdbc-driver = com.mysql.cj.jdbc.Driver
+gravitino.auxService.iceberg-rest.catalog-backend-name = catalog_iceberg
diff --git a/init/hive/init.sh b/init/hive/init.sh
index 859f493..aea00c5 100644
--- a/init/hive/init.sh
+++ b/init/hive/init.sh
@@ -19,6 +19,7 @@
 
 sed -i '$d' /usr/local/sbin/start.sh
 sed -i '$d' /usr/local/sbin/start.sh
+sed -i 's|hdfs://localhost:9000|hdfs://hive:9000|g' 
/usr/local/hive/conf/hive-site.xml
 /bin/bash /usr/local/sbin/start.sh
 hdfs dfs -mkdir /user/iceberg/
 hdfs dfs -mkdir /user/iceberg/warehouse
diff --git a/init/spark/init.sh b/init/spark/init.sh
index 603d9f0..aec0e97 100644
--- a/init/spark/init.sh
+++ b/init/spark/init.sh
@@ -18,5 +18,8 @@
 #
 mkdir -p /opt/spark/conf
 cp /tmp/spark/spark-defaults.conf /opt/spark/conf
-wget 
https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.4_2.12/1.3.1/iceberg-spark-runtime-3.4_2.12-1.3.1.jar
  -O /opt/spark/jars/iceberg-spark-runtime-3.4_2.12-1.3.1.jar
+wget 
https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.4_2.12/1.5.2/iceberg-spark-runtime-3.4_2.12-1.5.2.jar
  -O /opt/spark/jars/iceberg-spark-runtime-3.4_2.12-1.5.2.jar
+wget 
https://repository.apache.org/content/repositories/orgapachegravitino-1090/org/apache/gravitino/gravitino-spark-connector-runtime-3.4_2.12/0.6.0-incubating/gravitino-spark-connector-runtime-3.4_2.12-0.6.0-incubating.jar
 -O 
/opt/spark/jars/gravitino-spark-connector-runtime-3.4_2.12-0.6.0-incubating.jar
+wget 
https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.27/mysql-connector-java-8.0.27.jar
 -O /opt/spark/jars/mysql-connector-java-8.0.27.jar
+sh  /tmp/common/init_metalake_catalog.sh
 tail -f /dev/null
diff --git a/init/spark/spark-defaults.conf b/init/spark/spark-defaults.conf
index bca1d7b..c6c72b4 100644
--- a/init/spark/spark-defaults.conf
+++ b/init/spark/spark-defaults.conf
@@ -17,8 +17,13 @@
 # under the License.
 #
 
+spark.plugins org.apache.gravitino.spark.connector.plugin.GravitinoSparkPlugin
+spark.sql.gravitino.uri http://gravitino:8090
+spark.sql.gravitino.metalake metalake_demo
+spark.sql.gravitino.enableIcebergSupport true 
 spark.sql.extensions 
org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
-spark.sql.catalog.catalog_iceberg org.apache.iceberg.spark.SparkCatalog
-spark.sql.catalog.catalog_iceberg.type rest
-spark.sql.catalog.catalog_iceberg.uri http://gravitino:9001/iceberg/
+spark.sql.catalog.catalog_rest org.apache.iceberg.spark.SparkCatalog
+spark.sql.catalog.catalog_rest.type rest
+spark.sql.catalog.catalog_rest.uri http://gravitino:9001/iceberg/
 spark.locality.wait.node 0
+spark.sql.warehouse.dir hdfs://hive:9000/user/hive/warehouse
diff --git a/init/trino/init.sh b/init/trino/init.sh
index 70d6199..21ba967 100644
--- a/init/trino/init.sh
+++ b/init/trino/init.sh
@@ -17,106 +17,7 @@
 # under the License.
 #
 
-# Since trino-connector needs to connect Gravitino service, get the default 
metalake
-# Create metalake
-response=$(curl -X POST -H "Content-Type: application/json" -d 
'{"name":"metalake_demo","comment":"comment","properties":{}}' 
http://gravitino:8090/api/metalakes)
-if echo "$response" | grep -q "\"code\":0"; then
-  true # Placeholder, do nothing
-else
-  echo "Metalake metalake_demo create failed"
-  exit 1
-fi
-
-# Check metalake if created
-response=$(curl -X GET -H "Content-Type: application/json" 
http://gravitino:8090/api/metalakes)
-if echo "$response" | grep -q "metalake_demo"; then
-  echo "Metalake metalake_demo successfully created"
-else
-  echo "Metalake metalake_demo create failed"
-  exit 1
-fi
-
-# Create Hive catalog for experience Gravitino service
-curl -X POST -H "Content-Type: application/json" -d 
'{"name":"catalog_hive","type":"RELATIONAL", "provider":"hive", 
"comment":"comment","properties":{"metastore.uris":"thrift://hive:9083"}}' 
http://gravitino:8090/api/metalakes/metalake_demo/catalogs
-
-
-# Check catalog if created
-response=$(curl -X GET -H "Content-Type: application/json" 
http://gravitino:8090/api/metalakes/metalake_demo/catalogs)
-if echo "$response" | grep -q "catalog_hive"; then
-  echo "Catalog catalog_hive successfully created"
-else
-  echo "Catalog catalog_hive create failed"
-  exit 1
-fi
-
-# Create Postgresql catalog for experience Gravitino service
-curl -X POST -H "Accept: application/vnd.gravitino.v1+json" -H "Content-Type: 
application/json" -d '{
-    "name":"catalog_postgres",
-    "type":"RELATIONAL",
-    "provider":"jdbc-postgresql",
-    "comment":"comment",
-    "properties":{
-        "jdbc-url":"jdbc:postgresql://postgresql/db",
-        "jdbc-user":"postgres",
-        "jdbc-password":"postgres",
-        "jdbc-database":"db",
-        "jdbc-driver": "org.postgresql.Driver"
-    }
-}' http://gravitino:8090/api/metalakes/metalake_demo/catalogs
-
-response=$(curl -X GET -H "Content-Type: application/json" 
http://gravitino:8090/api/metalakes/metalake_demo/catalogs)
-if echo "$response" | grep -q "catalog_postgres"; then
-  echo "Catalog catalog_postgres successfully created"
-else
-  echo "Catalog catalog_postgres create failed"
-  exit 1
-fi
-
-# Create Postgresql catalog for experience Gravitino service
-curl -X POST -H "Accept: application/vnd.gravitino.v1+json" -H "Content-Type: 
application/json" -d '{
-    "name":"catalog_mysql",
-    "type":"RELATIONAL",
-    "provider":"jdbc-mysql",
-    "comment":"comment",
-    "properties":{
-        "jdbc-url":"jdbc:mysql://mysql:3306",
-        "jdbc-user":"mysql",
-        "jdbc-password":"mysql",
-        "jdbc-driver": "com.mysql.cj.jdbc.Driver"
-    }
-}' http://gravitino:8090/api/metalakes/metalake_demo/catalogs
-
-response=$(curl -X GET -H "Content-Type: application/json" 
http://gravitino:8090/api/metalakes/metalake_demo/catalogs)
-if echo "$response" | grep -q "catalog_mysql"; then
-  echo "Catalog catalog_mysql successfully created"
-else
-  echo "Catalog catalog_mysql create failed"
-  exit 1
-fi
-
-# Create Iceberg catalog for experience Gravitino service
-curl -X POST -H "Accept: application/vnd.gravitino.v1+json" -H "Content-Type: 
application/json" -d '{
-    "name":"catalog_iceberg",
-    "type":"RELATIONAL",
-    "provider":"lakehouse-iceberg",
-    "comment":"comment",
-    "properties":{
-        "uri":"jdbc:mysql://mysql:3306/db",
-        "catalog-backend":"jdbc",
-        "warehouse":"hdfs://hive:9000/user/iceberg/warehouse/",
-        "jdbc-user":"mysql",
-        "jdbc-password":"mysql",
-        "jdbc-driver":"com.mysql.cj.jdbc.Driver"
-    }
-}' http://gravitino:8090/api/metalakes/metalake_demo/catalogs
-
-response=$(curl -X GET -H "Content-Type: application/json" 
http://gravitino:8090/api/metalakes/metalake_demo/catalogs)
-if echo "$response" | grep -q "catalog_iceberg"; then
-  echo "Catalog catalog_iceberg successfully created"
-else
-  echo "Catalog catalog_iceberg create failed"
-  exit 1
-fi
+sh  /tmp/common/init_metalake_catalog.sh
 
 /etc/trino/update-trino-conf.sh
 nohup /usr/lib/trino/bin/run-trino &

Reply via email to