This is an automated email from the ASF dual-hosted git repository.
russellspitzer pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/polaris.git
The following commit(s) were added to refs/heads/main by this push:
new adc3e602 Support catalog backed by s3 in run_spark_sql.sh (#199)
adc3e602 is described below
commit adc3e602f37ebf635a268253f5c8d2639c533c67
Author: Yufei Gu <[email protected]>
AuthorDate: Tue Sep 3 07:15:42 2024 -0700
Support catalog backed by s3 in run_spark_sql.sh (#199)
Co-authored-by: Yufei Gu <yufei.apache.org>
---
regtests/run_spark_sql.sh | 97 ++++++++++++++++++++++++++++++++++-------------
1 file changed, 70 insertions(+), 27 deletions(-)
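
For orientation before the patch: the change lets the script accept either zero or two arguments. A minimal usage sketch follows; the bucket name and IAM role ARN are placeholder values for illustration, not part of the commit.

    # Catalog backed by the local filesystem (no arguments)
    ./run_spark_sql.sh

    # Catalog backed by AWS S3: pass the S3 base location and the IAM role
    # the catalog assumes when accessing it (placeholder values shown)
    ./run_spark_sql.sh s3://example-bucket/polaris arn:aws:iam::111122223333:role/example-polaris-role

In both modes the script creates a catalog named "manual_spark" via the Polaris management API and then opens an interactive spark-sql shell against it.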
diff --git a/regtests/run_spark_sql.sh b/regtests/run_spark_sql.sh
index d1d29e76..9e1a53e2 100755
--- a/regtests/run_spark_sql.sh
+++ b/regtests/run_spark_sql.sh
@@ -16,11 +16,32 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
+
+# -----------------------------------------------------------------------------
+# Purpose: Launch the Spark SQL shell to interact with Polaris.
+# -----------------------------------------------------------------------------
+#
+# Usage:
+# ./run_spark_sql.sh [S3-location AWS-IAM-role]
#
+# Description:
+# - Without arguments: Runs against a catalog backed by the local filesystem.
+# - With two arguments: Runs against a catalog backed by AWS S3.
+# - [S3-location] - The S3 path to use as the default base location for the catalog.
+# - [AWS-IAM-role] - The AWS IAM role for the catalog to assume when accessing the S3 location.
#
-# Run this to open an interactive spark-sql shell talking to a catalog named "manual_spark"
+# Examples:
+# - Run against local filesystem:
+# ./run_spark_sql.sh
#
-# You must run 'use polaris;' as your first query in the spark-sql shell.
+# - Run against AWS S3:
+#     ./run_spark_sql.sh s3://my-bucket/path arn:aws:iam::123456789001:role/my-role
+
+if [ $# -ne 0 ] && [ $# -ne 2 ]; then
+ echo "run_spark_sql.sh only accepts 0 or 2 arguments"
+ echo "Usage: ./run_spark_sql.sh [S3-location AWS-IAM-role]"
+ exit 1
+fi
REGTEST_HOME=$(dirname $(realpath $0))
cd ${REGTEST_HOME}
@@ -36,37 +57,60 @@ fi
SPARK_BEARER_TOKEN="${REGTEST_ROOT_BEARER_TOKEN:-principal:root;realm:default-realm}"
-# Use local filesystem by default
-curl -X POST -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \
- http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs \
- -d '{
- "catalog": {
- "name": "manual_spark",
- "type": "INTERNAL",
- "readOnly": false,
- "properties": {
- "default-base-location": "file:///tmp/polaris/"
- },
- "storageConfigInfo": {
- "storageType": "FILE",
- "allowedLocations": [
- "file:///tmp"
- ]
- }
- }
- }'
+if [ $# -eq 0 ]; then
+ # create a catalog backed by the local filesystem
+ curl -X POST -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" \
+ -H 'Accept: application/json' \
+ -H 'Content-Type: application/json' \
+ http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs \
+ -d '{
+ "catalog": {
+ "name": "manual_spark",
+ "type": "INTERNAL",
+ "readOnly": false,
+ "properties": {
+ "default-base-location": "file:///tmp/polaris/"
+ },
+ "storageConfigInfo": {
+ "storageType": "FILE",
+ "allowedLocations": [
+ "file:///tmp"
+ ]
+ }
+ }
+ }'
+
+elif [ $# -eq 2 ]; then
+ # create a catalog backed by S3
+ S3_LOCATION=$1
+ AWS_IAM_ROLE=$2
-# Use the following instead of below to use s3 instead of local filesystem
-#curl -i -X POST -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \
-# http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs \
-# -d "{\"name\": \"manual_spark\", \"id\": 100, \"type\": \"INTERNAL\", \"readOnly\": false, \"properties\": {\"default-base-location\": \"s3://${S3_BUCKET}/${USER}/polaris/\"}}"
+ curl -i -X POST -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" \
+ -H 'Accept: application/json' \
+ -H 'Content-Type: application/json' \
+ http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs \
+ -d "{
+ \"name\": \"manual_spark\",
+ \"id\": 100,
+ \"type\": \"INTERNAL\",
+ \"readOnly\": false,
+ \"properties\": {
+ \"default-base-location\": \"${S3_LOCATION}\"
+ },
+ \"storageConfigInfo\": {
+ \"storageType\": \"S3\",
+ \"allowedLocations\": [\"${S3_LOCATION}/\"],
+ \"roleArn\": \"${AWS_IAM_ROLE}\"
+ }
+ }"
+fi
# Add TABLE_WRITE_DATA to the catalog's catalog_admin role since by default it can only manage access and metadata
curl -i -X PUT -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \
  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs/manual_spark/catalog-roles/catalog_admin/grants \
-d '{"type": "catalog", "privilege": "TABLE_WRITE_DATA"}' > /dev/stderr
-# For now, also explicitly assign the catalog_admin to the service_admin. Remove once GS fully rolled out for auto-assign.
+# Assign the catalog_admin to the service_admin.
curl -i -X PUT -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \
  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/principal-roles/service_admin/catalog-roles/manual_spark \
-d '{"name": "catalog_admin"}' > /dev/stderr
@@ -74,7 +118,6 @@ curl -i -X PUT -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: appl
curl -X GET -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \
http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs/manual_spark
-echo ${SPARK_HOME}/bin/spark-sql -S --conf spark.sql.catalog.polaris.token="${SPARK_BEARER_TOKEN}"
${SPARK_HOME}/bin/spark-sql -S --conf spark.sql.catalog.polaris.token="${SPARK_BEARER_TOKEN}" \
--conf spark.sql.catalog.polaris.warehouse=manual_spark \
--conf spark.sql.defaultCatalog=polaris \