This is an automated email from the ASF dual-hosted git repository.
yuqi4733 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/gravitino-playground.git
The following commit(s) were added to refs/heads/main by this push:
new 2c184c7 [#34] add Gravitino Spark connector to playground (#45)
2c184c7 is described below
commit 2c184c71263041350548410d9806bdd813e10b0f
Author: FANNG <[email protected]>
AuthorDate: Thu Aug 29 14:44:21 2024 +0800
[#34] add Gravitino Spark connector to playground (#45)
add spark connector to playground
fixes: #34
---
README.md | 60 ++++++++++++++++++---
docker-compose.yaml | 4 +-
init/common/init_metalake_catalog.sh | 90 +++++++++++++++++++++++++++++++
init/gravitino/gravitino.conf | 1 +
init/hive/init.sh | 1 +
init/spark/init.sh | 5 +-
init/spark/spark-defaults.conf | 11 ++--
init/trino/init.sh | 101 +----------------------------------
8 files changed, 162 insertions(+), 111 deletions(-)
diff --git a/README.md b/README.md
index 722abab..033c7a6 100644
--- a/README.md
+++ b/README.md
@@ -75,7 +75,7 @@ docker exec -it playground-trino bash
trino@container_id:/$ trino
```
-### Using Jupiter Notebook
+## Using Jupyter Notebook
1. Open the Jupyter Notebook in the browser at
[http://localhost:18888](http://localhost:18888).
@@ -83,9 +83,23 @@ trino@container_id:/$ trino
3. Start the notebook and run the cells.
+## Using Spark client
+
+1. Log in to the Gravitino playground Spark Docker container using the
following command:
+
+```shell
+docker exec -it playground-spark bash
+```
+
+2. Open the Spark SQL client in the container.
+
+```shell
+spark@container_id:/$ cd /opt/spark && /bin/bash bin/spark-sql
+```
+
## Example
-### Simple queries
+### Simple Trino queries
You can use simple queries to test in the Trino CLI.
@@ -156,6 +170,38 @@ WHERE e.employee_id = p.employee_id AND p.employee_id =
s.employee_id
GROUP BY e.employee_id, given_name, family_name;
```
+### Using Spark and Trino
+
+You might consider generating data with SparkSQL and then querying this data
using Trino. Give it a try with Gravitino:
+
+1. Log in to the Spark container and execute the following SQL statements:
+
+```sql
+-- Use the Hive catalog to create a Hive table
+USE catalog_hive;
+CREATE DATABASE product;
+USE product;
+
+CREATE TABLE IF NOT EXISTS employees (
+ id INT,
+ name STRING,
+ age INT
+)
+PARTITIONED BY (department STRING)
+STORED AS PARQUET;
+DESC TABLE EXTENDED employees;
+
+INSERT OVERWRITE TABLE employees PARTITION(department='Engineering') VALUES
(1, 'John Doe', 30), (2, 'Jane Smith', 28);
+INSERT OVERWRITE TABLE employees PARTITION(department='Marketing') VALUES (3,
'Mike Brown', 32);
+```
+
+2. Log in to the Trino container and execute the following SQL statements:
+
+```sql
+SELECT * FROM catalog_hive.product.employees WHERE department = 'Engineering';
+```
+
+
### Using Apache Iceberg REST service
If you want to migrate your business from Hive to Iceberg. Some tables will
use Hive, and the other tables will use Iceberg.
@@ -166,12 +212,14 @@ Then, you can use Trino to read the data from the Hive
table joining the Iceberg
```text
spark.sql.extensions
org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
-spark.sql.catalog.catalog_iceberg org.apache.iceberg.spark.SparkCatalog
-spark.sql.catalog.catalog_iceberg.type rest
-spark.sql.catalog.catalog_iceberg.uri http://gravitino:9001/iceberg/
+spark.sql.catalog.catalog_rest org.apache.iceberg.spark.SparkCatalog
+spark.sql.catalog.catalog_rest.type rest
+spark.sql.catalog.catalog_rest.uri http://gravitino:9001/iceberg/
spark.locality.wait.node 0
```
+Please note that `catalog_rest` in SparkSQL and `catalog_iceberg` in Gravitino
and Trino share the same Iceberg JDBC backend, which implies that they can
access the same dataset.
+
1. Login Spark container and execute the steps.
```shell
@@ -183,7 +231,7 @@ spark@container_id:/$ cd /opt/spark && /bin/bash
bin/spark-sql
```
```SQL
-use catalog_iceberg;
+use catalog_rest;
create database sales;
use sales;
create table customers (customer_id int, customer_name varchar(100),
customer_email varchar(100));
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 8ac75e5..e9974f2 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -82,6 +82,7 @@ services:
entrypoint: /bin/bash /tmp/trino/init.sh
volumes:
- ./init/trino:/tmp/trino
+ - ./init/common:/tmp/common
depends_on:
hive :
condition: service_healthy
@@ -150,6 +151,7 @@ services:
- "14040:4040"
volumes:
- ./init/spark:/tmp/spark
+ - ./init/common:/tmp/common
deploy:
resources:
limits:
@@ -173,4 +175,4 @@ services:
resources:
limits:
cpus: "0.5"
- memory: 500M
\ No newline at end of file
+ memory: 500M
diff --git a/init/common/init_metalake_catalog.sh
b/init/common/init_metalake_catalog.sh
new file mode 100644
index 0000000..8a4a7be
--- /dev/null
+++ b/init/common/init_metalake_catalog.sh
@@ -0,0 +1,90 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+response=$(curl http://gravitino:8090/api/metalakes/metalake_demo)
+if echo "$response" | grep -q "\"code\":0"; then
+ true
+else
+ response=$(curl -X POST -H "Content-Type: application/json" -d
'{"name":"metalake_demo","comment":"comment","properties":{}}'
http://gravitino:8090/api/metalakes)
+ if echo "$response" | grep -q "\"code\":0"; then
+ true # Placeholder, do nothing
+ else
+ echo "Metalake metalake_demo create failed"
+ exit 1
+ fi
+fi
+
+
+response=$(curl
http://gravitino:8090/api/metalakes/metalake_demo/catalogs/catalog_hive)
+if echo "$response" | grep -q "\"code\":0"; then
+ true
+else
+ # Create Hive catalog for experience Gravitino service
+ response=$(curl -X POST -H "Content-Type: application/json" -d
'{"name":"catalog_hive","type":"RELATIONAL", "provider":"hive",
"comment":"comment","properties":{"metastore.uris":"thrift://hive:9083",
"spark.bypass.spark.sql.hive.metastore.jars":"path",
"spark.bypass.spark.sql.hive.metastore.jars.path":"file:///opt/spark/jars/*"
}}' http://gravitino:8090/api/metalakes/metalake_demo/catalogs)
+ if echo "$response" | grep -q "\"code\":0"; then
+ true # Placeholder, do nothing
+ else
+ echo "catalog_hive create failed"
+ exit 1
+ fi
+fi
+
+response=$(curl
http://gravitino:8090/api/metalakes/metalake_demo/catalogs/catalog_postgres)
+if echo "$response" | grep -q "\"code\":0"; then
+ true
+else
+ # Create Postgresql catalog for experience Gravitino service
+ response=$(curl -X POST -H "Accept: application/vnd.gravitino.v1+json" -H
"Content-Type: application/json" -d '{ "name":"catalog_postgres",
"type":"RELATIONAL", "provider":"jdbc-postgresql", "comment":"comment",
"properties":{ "jdbc-url":"jdbc:postgresql://postgresql/db",
"jdbc-user":"postgres", "jdbc-password":"postgres", "jdbc-database":"db",
"jdbc-driver": "org.postgresql.Driver" } }'
http://gravitino:8090/api/metalakes/metalake_demo/catalogs)
+ if echo "$response" | grep -q "\"code\":0"; then
+ true # Placeholder, do nothing
+ else
+ echo "catalog_postgres create failed"
+ exit 1
+ fi
+fi
+
+response=$(curl
http://gravitino:8090/api/metalakes/metalake_demo/catalogs/catalog_mysql)
+if echo "$response" | grep -q "\"code\":0"; then
+ true
+else
+ # Create Mysql catalog for experience Gravitino service
+ response=$(curl -X POST -H "Accept: application/vnd.gravitino.v1+json" -H
"Content-Type: application/json" -d '{ "name":"catalog_mysql",
"type":"RELATIONAL", "provider":"jdbc-mysql", "comment":"comment",
"properties":{ "jdbc-url":"jdbc:mysql://mysql:3306", "jdbc-user":"mysql",
"jdbc-password":"mysql", "jdbc-driver": "com.mysql.cj.jdbc.Driver" } }'
http://gravitino:8090/api/metalakes/metalake_demo/catalogs)
+
+ if echo "$response" | grep -q "catalog_mysql"; then
+ true # Placeholder, do nothing
+ else
+ echo "Catalog catalog_mysql create failed"
+ exit 1
+ fi
+fi
+
+response=$(curl
http://gravitino:8090/api/metalakes/metalake_demo/catalogs/catalog_iceberg)
+if echo "$response" | grep -q "\"code\":0"; then
+ true
+else
+ # Create Iceberg catalog for experience Gravitino service
+ response=$(curl -X POST -H "Accept: application/vnd.gravitino.v1+json" -H
"Content-Type: application/json" -d '{ "name":"catalog_iceberg",
"type":"RELATIONAL", "provider":"lakehouse-iceberg", "comment":"comment",
"properties":{ "uri":"jdbc:mysql://mysql:3306/db", "catalog-backend":"jdbc",
"warehouse":"hdfs://hive:9000/user/iceberg/warehouse/", "jdbc-user":"mysql",
"jdbc-password":"mysql", "jdbc-driver":"com.mysql.cj.jdbc.Driver"} }'
http://gravitino:8090/api/metalakes/metalake_demo/catalogs)
+ if echo "$response" | grep -q "\"code\":0"; then
+ true # Placeholder, do nothing
+ else
+ echo "create catalog_iceberg failed"
+ exit 1
+ fi
+fi
+
diff --git a/init/gravitino/gravitino.conf b/init/gravitino/gravitino.conf
index 1adc75f..b30c9ad 100755
--- a/init/gravitino/gravitino.conf
+++ b/init/gravitino/gravitino.conf
@@ -75,3 +75,4 @@ gravitino.auxService.iceberg-rest.warehouse =
hdfs://hive:9000/user/iceberg/ware
gravitino.auxService.iceberg-rest.jdbc.user = mysql
gravitino.auxService.iceberg-rest.jdbc.password = mysql
gravitino.auxService.iceberg-rest.jdbc-driver = com.mysql.cj.jdbc.Driver
+gravitino.auxService.iceberg-rest.catalog-backend-name = catalog_iceberg
diff --git a/init/hive/init.sh b/init/hive/init.sh
index 859f493..aea00c5 100644
--- a/init/hive/init.sh
+++ b/init/hive/init.sh
@@ -19,6 +19,7 @@
sed -i '$d' /usr/local/sbin/start.sh
sed -i '$d' /usr/local/sbin/start.sh
+sed -i 's|hdfs://localhost:9000|hdfs://hive:9000|g'
/usr/local/hive/conf/hive-site.xml
/bin/bash /usr/local/sbin/start.sh
hdfs dfs -mkdir /user/iceberg/
hdfs dfs -mkdir /user/iceberg/warehouse
diff --git a/init/spark/init.sh b/init/spark/init.sh
index 603d9f0..aec0e97 100644
--- a/init/spark/init.sh
+++ b/init/spark/init.sh
@@ -18,5 +18,8 @@
#
mkdir -p /opt/spark/conf
cp /tmp/spark/spark-defaults.conf /opt/spark/conf
-wget
https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.4_2.12/1.3.1/iceberg-spark-runtime-3.4_2.12-1.3.1.jar
-O /opt/spark/jars/iceberg-spark-runtime-3.4_2.12-1.3.1.jar
+wget
https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.4_2.12/1.5.2/iceberg-spark-runtime-3.4_2.12-1.5.2.jar
-O /opt/spark/jars/iceberg-spark-runtime-3.4_2.12-1.5.2.jar
+wget
https://repository.apache.org/content/repositories/orgapachegravitino-1090/org/apache/gravitino/gravitino-spark-connector-runtime-3.4_2.12/0.6.0-incubating/gravitino-spark-connector-runtime-3.4_2.12-0.6.0-incubating.jar
-O
/opt/spark/jars/gravitino-spark-connector-runtime-3.4_2.12-0.6.0-incubating.jar
+wget
https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.27/mysql-connector-java-8.0.27.jar
-O /opt/spark/jars/mysql-connector-java-8.0.27.jar
+sh /tmp/common/init_metalake_catalog.sh
tail -f /dev/null
diff --git a/init/spark/spark-defaults.conf b/init/spark/spark-defaults.conf
index bca1d7b..c6c72b4 100644
--- a/init/spark/spark-defaults.conf
+++ b/init/spark/spark-defaults.conf
@@ -17,8 +17,13 @@
# under the License.
#
+spark.plugins org.apache.gravitino.spark.connector.plugin.GravitinoSparkPlugin
+spark.sql.gravitino.uri http://gravitino:8090
+spark.sql.gravitino.metalake metalake_demo
+spark.sql.gravitino.enableIcebergSupport true
spark.sql.extensions
org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
-spark.sql.catalog.catalog_iceberg org.apache.iceberg.spark.SparkCatalog
-spark.sql.catalog.catalog_iceberg.type rest
-spark.sql.catalog.catalog_iceberg.uri http://gravitino:9001/iceberg/
+spark.sql.catalog.catalog_rest org.apache.iceberg.spark.SparkCatalog
+spark.sql.catalog.catalog_rest.type rest
+spark.sql.catalog.catalog_rest.uri http://gravitino:9001/iceberg/
spark.locality.wait.node 0
+spark.sql.warehouse.dir hdfs://hive:9000/user/hive/warehouse
diff --git a/init/trino/init.sh b/init/trino/init.sh
index 70d6199..21ba967 100644
--- a/init/trino/init.sh
+++ b/init/trino/init.sh
@@ -17,106 +17,7 @@
# under the License.
#
-# Since trino-connector needs to connect Gravitino service, get the default
metalake
-# Create metalake
-response=$(curl -X POST -H "Content-Type: application/json" -d
'{"name":"metalake_demo","comment":"comment","properties":{}}'
http://gravitino:8090/api/metalakes)
-if echo "$response" | grep -q "\"code\":0"; then
- true # Placeholder, do nothing
-else
- echo "Metalake metalake_demo create failed"
- exit 1
-fi
-
-# Check metalake if created
-response=$(curl -X GET -H "Content-Type: application/json"
http://gravitino:8090/api/metalakes)
-if echo "$response" | grep -q "metalake_demo"; then
- echo "Metalake metalake_demo successfully created"
-else
- echo "Metalake metalake_demo create failed"
- exit 1
-fi
-
-# Create Hive catalog for experience Gravitino service
-curl -X POST -H "Content-Type: application/json" -d
'{"name":"catalog_hive","type":"RELATIONAL", "provider":"hive",
"comment":"comment","properties":{"metastore.uris":"thrift://hive:9083"}}'
http://gravitino:8090/api/metalakes/metalake_demo/catalogs
-
-
-# Check catalog if created
-response=$(curl -X GET -H "Content-Type: application/json"
http://gravitino:8090/api/metalakes/metalake_demo/catalogs)
-if echo "$response" | grep -q "catalog_hive"; then
- echo "Catalog catalog_hive successfully created"
-else
- echo "Catalog catalog_hive create failed"
- exit 1
-fi
-
-# Create Postgresql catalog for experience Gravitino service
-curl -X POST -H "Accept: application/vnd.gravitino.v1+json" -H "Content-Type:
application/json" -d '{
- "name":"catalog_postgres",
- "type":"RELATIONAL",
- "provider":"jdbc-postgresql",
- "comment":"comment",
- "properties":{
- "jdbc-url":"jdbc:postgresql://postgresql/db",
- "jdbc-user":"postgres",
- "jdbc-password":"postgres",
- "jdbc-database":"db",
- "jdbc-driver": "org.postgresql.Driver"
- }
-}' http://gravitino:8090/api/metalakes/metalake_demo/catalogs
-
-response=$(curl -X GET -H "Content-Type: application/json"
http://gravitino:8090/api/metalakes/metalake_demo/catalogs)
-if echo "$response" | grep -q "catalog_postgres"; then
- echo "Catalog catalog_postgres successfully created"
-else
- echo "Catalog catalog_postgres create failed"
- exit 1
-fi
-
-# Create Postgresql catalog for experience Gravitino service
-curl -X POST -H "Accept: application/vnd.gravitino.v1+json" -H "Content-Type:
application/json" -d '{
- "name":"catalog_mysql",
- "type":"RELATIONAL",
- "provider":"jdbc-mysql",
- "comment":"comment",
- "properties":{
- "jdbc-url":"jdbc:mysql://mysql:3306",
- "jdbc-user":"mysql",
- "jdbc-password":"mysql",
- "jdbc-driver": "com.mysql.cj.jdbc.Driver"
- }
-}' http://gravitino:8090/api/metalakes/metalake_demo/catalogs
-
-response=$(curl -X GET -H "Content-Type: application/json"
http://gravitino:8090/api/metalakes/metalake_demo/catalogs)
-if echo "$response" | grep -q "catalog_mysql"; then
- echo "Catalog catalog_mysql successfully created"
-else
- echo "Catalog catalog_mysql create failed"
- exit 1
-fi
-
-# Create Iceberg catalog for experience Gravitino service
-curl -X POST -H "Accept: application/vnd.gravitino.v1+json" -H "Content-Type:
application/json" -d '{
- "name":"catalog_iceberg",
- "type":"RELATIONAL",
- "provider":"lakehouse-iceberg",
- "comment":"comment",
- "properties":{
- "uri":"jdbc:mysql://mysql:3306/db",
- "catalog-backend":"jdbc",
- "warehouse":"hdfs://hive:9000/user/iceberg/warehouse/",
- "jdbc-user":"mysql",
- "jdbc-password":"mysql",
- "jdbc-driver":"com.mysql.cj.jdbc.Driver"
- }
-}' http://gravitino:8090/api/metalakes/metalake_demo/catalogs
-
-response=$(curl -X GET -H "Content-Type: application/json"
http://gravitino:8090/api/metalakes/metalake_demo/catalogs)
-if echo "$response" | grep -q "catalog_iceberg"; then
- echo "Catalog catalog_iceberg successfully created"
-else
- echo "Catalog catalog_iceberg create failed"
- exit 1
-fi
+sh /tmp/common/init_metalake_catalog.sh
/etc/trino/update-trino-conf.sh
nohup /usr/lib/trino/bin/run-trino &