This is an automated email from the ASF dual-hosted git repository.

yufei pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/polaris.git

The following commit(s) were added to refs/heads/main by this push:
     new 6e45ef7e0  Spark: Add regtests for Spark client to test built jars (#1402)
6e45ef7e0 is described below

commit 6e45ef7e0fd30820bd1c2f196e86d20f9ac75ba5
Author: gh-yzou <167037035+gh-y...@users.noreply.github.com>
AuthorDate: Wed Apr 23 22:12:00 2025 -0700

    Spark: Add regtests for Spark client to test built jars (#1402)
---
 .github/workflows/spark_client_regtests.yml    |  62 +++++++++
 build.gradle.kts                               |   2 +
 plugins/spark/v3.5/regtests/Dockerfile         |  48 +++++++
 plugins/spark/v3.5/regtests/README.md          |  86 +++++++++++++
 plugins/spark/v3.5/regtests/docker-compose.yml |  46 +++++++
 plugins/spark/v3.5/regtests/run.sh             | 132 +++++++++++++++
 plugins/spark/v3.5/regtests/setup.sh           | 169 +++++++++++++++++++++++++
 plugins/spark/v3.5/regtests/spark_sql.ref      |  57 +++++++++
 plugins/spark/v3.5/regtests/spark_sql.sh       |  81 ++++++++++++
 plugins/spark/v3.5/spark/build.gradle.kts      |  43 +++++--
 10 files changed, 718 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/spark_client_regtests.yml b/.github/workflows/spark_client_regtests.yml
new file mode 100644
index 000000000..44e0fdca1
--- /dev/null
+++ b/.github/workflows/spark_client_regtests.yml
@@ -0,0 +1,62 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: Spark Client Regression Tests
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+
+jobs:
+  regtest:
+
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up JDK 21
+        uses: actions/setup-java@v4
+        with:
+          java-version: '21'
+          distribution: 'temurin'
+
+      - name: Fix permissions
+        run: mkdir -p regtests/output && chmod 777 regtests/output && chmod 777 regtests/t_*/ref/*
+
+      - name: Project build without testing
+        run: ./gradlew assemble
+
+      - name: Image build
+        run: |
+          ./gradlew \
+            :polaris-quarkus-server:assemble \
+            :polaris-quarkus-server:quarkusAppPartsBuild --rerun \
+            -Dquarkus.container-image.build=true
+
+      # NOTE: the regression tests run with Spark 3.5.5 and Scala 2.12 on Java 17. We also have
+      # integration tests, run via the existing gradle.yml, which only run on Java 21. Since
+      # Spark 3.5 is compatible with Java 8, 11, and 17, the Spark client should be run with
+      # those compatible Java versions.
+      # TODO: add a separate Spark client CI that runs with Java 8, 11 and 17.
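+      #
+      # A follow-up job could cover those versions with a build matrix; a minimal
+      # sketch (illustrative only, nothing here enables it):
+      #
+      #   strategy:
+      #     matrix:
+      #       java-version: [ 8, 11, 17 ]
+      #   steps:
+      #     - uses: actions/setup-java@v4
+      #       with:
+      #         java-version: ${{ matrix.java-version }}
+      #         distribution: 'temurin'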
+      - name: Regression Test
+        run: |
+          docker compose -f plugins/spark/v3.5/regtests/docker-compose.yml up --build --exit-code-from regtest

diff --git a/build.gradle.kts b/build.gradle.kts
index e39abe385..feb6e368c 100644
--- a/build.gradle.kts
+++ b/build.gradle.kts
@@ -125,6 +125,8 @@ tasks.named<RatTask>("rat").configure {
   excludes.add("**/kotlin-compiler*")
   excludes.add("**/build-logic/.kotlin/**")
+
+  excludes.add("plugins/**/*.ref")
 }
 
 // Pass environment variables:

diff --git a/plugins/spark/v3.5/regtests/Dockerfile b/plugins/spark/v3.5/regtests/Dockerfile
new file mode 100755
index 000000000..1620c12ae
--- /dev/null
+++ b/plugins/spark/v3.5/regtests/Dockerfile
@@ -0,0 +1,48 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+FROM docker.io/apache/spark:3.5.5-java17
+ARG POLARIS_HOST=polaris
+ENV POLARIS_HOST=$POLARIS_HOST
+ENV SPARK_HOME=/opt/spark
+ENV CURRENT_SCALA_VERSION='2.12'
+ENV LANGUAGE='en_US:en'
+
+USER root
+RUN apt update
+RUN apt-get install -y diffutils wget curl
+RUN mkdir -p /home/spark && \
+    chown -R spark /home/spark && \
+    mkdir -p /tmp/polaris-regtests && \
+    chown -R spark /tmp/polaris-regtests
+RUN mkdir /opt/spark/conf && chmod -R 777 /opt/spark/conf
+
+USER spark
+
+WORKDIR /home/spark/polaris
+
+COPY --chown=spark ./v3.5 /home/spark/polaris/v3.5
+
+# /home/spark/regtests might not be writable in all situations, see https://github.com/apache/polaris/pull/205
+USER root
+RUN chmod -R go+rwx /home/spark/polaris
+RUN chmod -R 777 ./v3.5/regtests
+USER spark
+
+ENTRYPOINT ["./v3.5/regtests/run.sh"]

diff --git a/plugins/spark/v3.5/regtests/README.md b/plugins/spark/v3.5/regtests/README.md
new file mode 100755
index 000000000..75dd57a5a
--- /dev/null
+++ b/plugins/spark/v3.5/regtests/README.md
@@ -0,0 +1,86 @@
+<!--
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements. See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership. The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License. You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied. See the License for the
+  specific language governing permissions and limitations
+  under the License.
+
+-->
+
+# End-to-end regression tests
+
+`regtests` provides basic end-to-end tests for Spark SQL using the Spark client jars.
+
+Regression tests are either run in Docker, using docker-compose to orchestrate the tests, or
+locally.
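+
+Under the hood, each test launches a `spark-sql` session with the built client jar and a `polaris`
+REST catalog. The following is a minimal sketch of that session, assembled from the conf written by
+`setup.sh` and the flags used by `spark_sql.sh` (the jar path and token value are illustrative):
+
+```shell
+${SPARK_HOME}/bin/spark-sql \
+  --jars /path/to/polaris-iceberg-<version>-spark-runtime-3.5_2.12.jar \
+  --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,io.delta.sql.DeltaSparkSessionExtension \
+  --conf spark.sql.catalog.polaris=org.apache.polaris.spark.SparkCatalog \
+  --conf spark.sql.catalog.polaris.uri=http://localhost:8181/api/catalog \
+  --conf spark.sql.catalog.polaris.token=${REGTEST_ROOT_BEARER_TOKEN} \
+  --conf spark.sql.catalog.polaris.warehouse=spark_sql_catalog
+```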
+
+**NOTE** regtests are intended as lightweight tests that ensure the built jars can be used to start
+Spark and run basic SQL commands. Please use the integration tests for more detailed testing.
+
+## Prerequisites
+
+It is recommended to clean the `regtests/output` directory before running tests. This can be done by
+running:
+
+```shell
+rm -rf ./plugins/spark/v3.5/regtests/output && mkdir -p ./plugins/spark/v3.5/regtests/output && chmod -R 777 ./plugins/spark/v3.5/regtests/output
+```
+
+## Run Tests With Docker Compose
+
+Tests can be run with docker-compose using the provided `./plugins/spark/v3.5/regtests/docker-compose.yml` file, as
+follows:
+
+```shell
+./gradlew build
+./gradlew \
+  :polaris-quarkus-server:assemble \
+  :polaris-quarkus-server:quarkusAppPartsBuild --rerun \
+  -Dquarkus.container-image.build=true
+docker compose -f ./plugins/spark/v3.5/regtests/docker-compose.yml up --build --exit-code-from regtest
+```
+
+In this setup, a Polaris container will be started in a docker-compose group, using the image
+previously built by the Gradle build. Then another container, including a Spark SQL shell, will run
+the tests. The exit code will be the same as the exit code of the Spark container.
+
+**NOTE** Docker Compose only supports testing with Scala 2.12, because no Scala 2.13 image is
+available for Spark 3.5. Scala 2.13 will be supported with Spark 4.0.
+
+This is the flow used in CI and should be done locally before pushing to GitHub to ensure that no
+environmental factors contribute to the outcome of the tests.
+
+**Important**: if you are also using minikube, for example to test the Helm chart, you may need to
+_unset_ the Docker environment that was pointing to the Minikube Docker daemon, otherwise the image
+will be built by the Minikube Docker daemon and will not be available to the local Docker daemon.
+This can be done by running, _before_ building the image and running the tests:
+
+```shell
+eval $(minikube -p minikube docker-env --unset)
+```
+
+## Run Tests Locally
+
+Regression tests can be run locally as well, using the test harness. For local testing, both
+Scala 2.12 and Scala 2.13 are supported.
+
+To run regression tests locally, run the following:
+- `./gradlew build` -- build the Polaris project and Spark client jars.
+- `./gradlew run` -- start a Polaris server on localhost:8181.
+- `env POLARIS_HOST=localhost ./plugins/spark/v3.5/regtests/run.sh` -- run regtests.
+
+Note: the regression tests expect Polaris to run with certain options, e.g. with support for `FILE`
+storage, default realm `POLARIS` and root credentials `root:secret`; if you run the above command,
+this will be the case. If you run Polaris in a different way, make sure that Polaris is configured
+appropriately.

diff --git a/plugins/spark/v3.5/regtests/docker-compose.yml b/plugins/spark/v3.5/regtests/docker-compose.yml
new file mode 100755
index 000000000..e1ea1a898
--- /dev/null
+++ b/plugins/spark/v3.5/regtests/docker-compose.yml
@@ -0,0 +1,46 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+services:
+  polaris:
+    image: apache/polaris:latest
+    ports:
+      - "8181"
+      - "8182"
+    environment:
+      AWS_REGION: us-west-2
+      POLARIS_BOOTSTRAP_CREDENTIALS: POLARIS,root,secret
+      quarkus.log.file.enable: "false"
+      quarkus.otel.sdk.disabled: "true"
+    healthcheck:
+      test: ["CMD", "curl", "http://localhost:8182/q/health"]
+      interval: 10s
+      timeout: 10s
+      retries: 5
+  regtest:
+    build:
+      context: ../..
+      dockerfile: v3.5/regtests/Dockerfile
+      args:
+        POLARIS_HOST: polaris
+    depends_on:
+      polaris:
+        condition: service_healthy
+    volumes:
+      - ./output:/tmp/polaris-regtests/

diff --git a/plugins/spark/v3.5/regtests/run.sh b/plugins/spark/v3.5/regtests/run.sh
new file mode 100755
index 000000000..d850a4465
--- /dev/null
+++ b/plugins/spark/v3.5/regtests/run.sh
@@ -0,0 +1,132 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Run without args to run all tests.
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+SPARK_ROOT_DIR=$(dirname ${SCRIPT_DIR})
+export SPARK_LOCAL_HOSTNAME=localhost # avoid VPN messing up driver local IP address binding
+
+FMT_RED='\033[0;31m'
+FMT_GREEN='\033[0;32m'
+FMT_NC='\033[0m'
+
+function loginfo() {
+  echo "$(date): ${@}"
+}
+function loggreen() {
+  echo -e "${FMT_GREEN}$(date): ${@}${FMT_NC}"
+}
+function logred() {
+  echo -e "${FMT_RED}$(date): ${@}${FMT_NC}"
+}
+
+# Allow bearer token to be provided if desired
+if [[ -z "$REGTEST_ROOT_BEARER_TOKEN" ]]; then
+  if ! output=$(curl -X POST -H "Polaris-Realm: POLARIS" "http://${POLARIS_HOST:-localhost}:8181/api/catalog/v1/oauth/tokens" \
+    -d "grant_type=client_credentials" \
+    -d "client_id=root" \
+    -d "client_secret=secret" \
+    -d "scope=PRINCIPAL_ROLE:ALL"); then
+    logred "Error: Failed to retrieve bearer token"
+    exit 1
+  fi
+
+  token=$(echo "$output" | awk -F\" '{print $4}')
+
+  if [ "$token" == "unauthorized_client" ]; then
+    logred "Error: Failed to retrieve bearer token"
+    exit 1
+  fi
+
+  export REGTEST_ROOT_BEARER_TOKEN=$token
+fi
+
+echo "Root bearer token: ${REGTEST_ROOT_BEARER_TOKEN}"
+
+NUM_FAILURES=0
+
+SCALA_VERSIONS=("2.12" "2.13")
+if [[ -n "$CURRENT_SCALA_VERSION" ]]; then
+  SCALA_VERSIONS=("${CURRENT_SCALA_VERSION}")
+fi
+SPARK_MAJOR_VERSION="3.5"
+SPARK_VERSION="3.5.5"
+
+for SCALA_VERSION in "${SCALA_VERSIONS[@]}"; do
+  echo "RUN REGRESSION TEST FOR SPARK_MAJOR_VERSION=${SPARK_MAJOR_VERSION}, SPARK_VERSION=${SPARK_VERSION}, SCALA_VERSION=${SCALA_VERSION}"
+  # find the project jar
+  SPARK_DIR=${SPARK_ROOT_DIR}/spark
+  JAR_PATH=$(find ${SPARK_DIR} -name "polaris-iceberg-*.*-spark-runtime-${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-*.jar" -print -quit)
+  echo "Found jar: ${JAR_PATH}"
+
+  SPARK_EXISTS="TRUE"
+  if [ -z "${SPARK_HOME}" ]; then
+    SPARK_EXISTS="FALSE"
+  fi
+
+  source ${SCRIPT_DIR}/setup.sh --sparkVersion ${SPARK_VERSION} --scalaVersion ${SCALA_VERSION} --jar ${JAR_PATH}
+
+  # run the spark_sql test
+  loginfo "Starting test spark_sql.sh"
+
+  TEST_FILE="spark_sql.sh"
+  TEST_SHORTNAME="spark_sql"
+  TEST_TMPDIR="/tmp/polaris-spark-regtests/${TEST_SHORTNAME}_${SPARK_MAJOR_VERSION}_${SCALA_VERSION}"
+  TEST_STDERR="${TEST_TMPDIR}/${TEST_SHORTNAME}.stderr"
+  TEST_STDOUT="${TEST_TMPDIR}/${TEST_SHORTNAME}.stdout"
+
+  mkdir -p ${TEST_TMPDIR}
+  if (( ${VERBOSE} )); then
+    ${SCRIPT_DIR}/${TEST_FILE} 2>${TEST_STDERR} | grep -v 'loading settings' | tee ${TEST_STDOUT}
+  else
+    ${SCRIPT_DIR}/${TEST_FILE} 2>${TEST_STDERR} | grep -v 'loading settings' > ${TEST_STDOUT}
+  fi
+  loginfo "Test run concluded for ${TEST_SUITE}:${TEST_SHORTNAME}"
+
+  TEST_REF="$(realpath ${SCRIPT_DIR})/${TEST_SHORTNAME}.ref"
+  if cmp --silent ${TEST_STDOUT} ${TEST_REF}; then
+    loggreen "Test SUCCEEDED: ${TEST_SUITE}:${TEST_SHORTNAME}"
+  else
+    logred "Test FAILED: ${TEST_SUITE}:${TEST_SHORTNAME}"
+    echo '#!/bin/bash' > ${TEST_TMPDIR}/${TEST_SHORTNAME}.fixdiffs.sh
+    echo "meld ${TEST_STDOUT} ${TEST_REF}" >> ${TEST_TMPDIR}/${TEST_SHORTNAME}.fixdiffs.sh
+    chmod 750 ${TEST_TMPDIR}/${TEST_SHORTNAME}.fixdiffs.sh
+    logred "To compare and fix diffs (if 'meld' installed): ${TEST_TMPDIR}/${TEST_SHORTNAME}.fixdiffs.sh"
+    logred "Or manually diff: diff ${TEST_STDOUT} ${TEST_REF}"
+    logred "See stderr from test run for additional diagnostics: ${TEST_STDERR}"
+    diff ${TEST_STDOUT} ${TEST_REF}
+    NUM_FAILURES=$(( NUM_FAILURES + 1 ))
+  fi
+
+  # clean up
+  if [ "${SPARK_EXISTS}" = "FALSE" ]; then
+    rm -rf ${SPARK_HOME}
+    export SPARK_HOME=""
+  fi
+done
+
+# clean the output dir
+rm -rf ${SCRIPT_DIR}/output
+
+loginfo "Tests completed with ${NUM_FAILURES} failures"
+if (( ${NUM_FAILURES} > 0 )); then
+  exit 1
+else
+  exit 0
+fi

diff --git a/plugins/spark/v3.5/regtests/setup.sh b/plugins/spark/v3.5/regtests/setup.sh
new file mode 100755
index 000000000..072b08f6d
--- /dev/null
+++ b/plugins/spark/v3.5/regtests/setup.sh
@@ -0,0 +1,169 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.
+# See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+###################################
+# Idempotent setup for Spark regression tests. Run it manually, or let run.sh invoke it.
+#
+# Warning - first-time setup may download a large amount of data
+# Warning - may clobber conf/spark-defaults.conf
+# Warning - it will set the SPARK_HOME environment variable to the Spark setup
+#
+# The script can be called independently, as follows:
+#   ./setup.sh --sparkVersion ${SPARK_VERSION} --scalaVersion ${SCALA_VERSION} --jar ${JAR_PATH}
+# Required parameters:
+#   --sparkVersion : the Spark version to set up
+#   --scalaVersion : the Scala version of Spark to set up
+#   --jar          : path to the local Polaris Spark client jar
+#

+set -x

+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

+SPARK_VERSION=3.5.5
+SCALA_VERSION=2.12
+POLARIS_CLIENT_JAR=""
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --sparkVersion)
+      SPARK_VERSION="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    --scalaVersion)
+      SCALA_VERSION="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    --jar)
+      POLARIS_CLIENT_JAR="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    --) shift;
+      break
+      ;;
+  esac
+done

+echo "SET UP FOR SPARK_VERSION=${SPARK_VERSION} SCALA_VERSION=${SCALA_VERSION} POLARIS_CLIENT_JAR=${POLARIS_CLIENT_JAR}"

+if [ "$SCALA_VERSION" == "2.12" ]; then
+  SPARK_DISTRIBUTION=spark-${SPARK_VERSION}-bin-hadoop3
+else
+  SPARK_DISTRIBUTION=spark-${SPARK_VERSION}-bin-hadoop3-scala${SCALA_VERSION}
+fi

+echo "Getting spark distribution ${SPARK_DISTRIBUTION}"

+if [ -z "${SPARK_HOME}" ]; then
+  SPARK_HOME=$(realpath ~/${SPARK_DISTRIBUTION})
+fi
+SPARK_CONF="${SPARK_HOME}/conf/spark-defaults.conf"
+DERBY_HOME="/tmp/derby"

+# Ensure binaries are downloaded locally
+echo 'Verifying Spark binaries...'
+if ! [ -f ${SPARK_HOME}/bin/spark-sql ]; then
+  echo 'Setting up Spark...'
+  if [ -z "${SPARK_VERSION}" ] || [ -z "${SPARK_DISTRIBUTION}" ]; then
+    echo 'SPARK_VERSION or SPARK_DISTRIBUTION not set. Please set SPARK_VERSION and SPARK_DISTRIBUTION to the desired version.'
+    exit 1
+  fi
+  if ! [ -f ~/${SPARK_DISTRIBUTION}.tgz ]; then
+    echo 'Downloading spark distro...'
+    wget -O ~/${SPARK_DISTRIBUTION}.tgz https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_DISTRIBUTION}.tgz
+    if ! [ -f ~/${SPARK_DISTRIBUTION}.tgz ]; then
+      if [[ "${OSTYPE}" == "darwin"* ]]; then
+        echo "Detected OS: mac. Running 'brew install wget' to try again."
+        brew install wget
+        wget -O ~/${SPARK_DISTRIBUTION}.tgz https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_DISTRIBUTION}.tgz
+      fi
+    fi
+  else
+    echo 'Found existing Spark tarball'
+  fi
+  # check if the download was successful
+  if ! [ -f ~/${SPARK_DISTRIBUTION}.tgz ]; then
+    echo 'Failed to download Spark distribution. Please check the logs.'
+    exit 1
+  fi
+  tar xzvf ~/${SPARK_DISTRIBUTION}.tgz -C ~/${TEST_ROOT_DIR}
+  if [ $? -ne 0 ]; then
+    echo 'Failed to extract Spark distribution. Please check the logs.'
+    exit 1
+  else
+    echo 'Extracted Spark distribution.'
+    rm ~/${SPARK_DISTRIBUTION}.tgz
+  fi
+  SPARK_HOME=$(realpath ~/${SPARK_DISTRIBUTION})
+  SPARK_CONF="${SPARK_HOME}/conf/spark-defaults.conf"
+else
+  echo 'Verified Spark distro already installed.'
+fi

+echo "SPARK_HOME=${SPARK_HOME}"
+echo "SPARK_CONF=${SPARK_CONF}"

+# Ensure Spark boilerplate conf is set
+echo 'Verifying Spark conf...'
+if grep 'POLARIS_TESTCONF_V5' ${SPARK_CONF} 2>/dev/null; then
+  echo 'Verified spark conf'
+else
+  echo 'Setting spark conf...'
+  # Instead of clobbering existing spark conf, just comment it all out in case it was customized carefully.
+  sed -i 's/^/# /' ${SPARK_CONF}
+cat << EOF >> ${SPARK_CONF}

+# POLARIS_TESTCONF_V5: POLARIS Spark client test conf (the marker token lets the grep above detect an existing setup)
+spark.jars $POLARIS_CLIENT_JAR
+spark.jars.packages io.delta:delta-spark_${SCALA_VERSION}:3.2.1
+spark.sql.variable.substitute true

+spark.driver.extraJavaOptions -Dderby.system.home=${DERBY_HOME}

+spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,io.delta.sql.DeltaSparkSessionExtension
+# this configuration is needed for Delta tables
+spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog
+spark.sql.catalog.polaris=org.apache.polaris.spark.SparkCatalog
+spark.sql.catalog.polaris.uri=http://${POLARIS_HOST:-localhost}:8181/api/catalog
+# this configuration is currently only used for Iceberg tables; generic tables
+# don't yet support credential vending
+spark.sql.catalog.polaris.header.X-Iceberg-Access-Delegation=vended-credentials
+spark.sql.catalog.polaris.client.region=us-west-2
+# configuration required to ensure DataSourceV2 load works correctly for
+# different table formats
+spark.sql.sources.useV1SourceList=''
EOF
+  echo 'Success!'
+fi

+# clean up the Derby home if it exists
+if [ -d "${DERBY_HOME}" ]; then
+  echo "Directory ${DERBY_HOME} exists. Deleting it..."
+  rm -rf "${DERBY_HOME}"
+fi

+echo "Launch spark-sql at ${SPARK_HOME}/bin/spark-sql"
+# bootstrap dependencies so that future queries don't need to wait for the downloads;
+# this is mostly useful for building the Docker image with all needed dependencies
+${SPARK_HOME}/bin/spark-sql -e "SELECT 1"

+# ensure SPARK_HOME is set up for later tests
+export SPARK_HOME=$SPARK_HOME

diff --git a/plugins/spark/v3.5/regtests/spark_sql.ref b/plugins/spark/v3.5/regtests/spark_sql.ref
new file mode 100755
index 000000000..5825d0931
--- /dev/null
+++ b/plugins/spark/v3.5/regtests/spark_sql.ref
@@ -0,0 +1,57 @@
+{"defaults":{"default-base-location":"file:///tmp/spark_catalog"},"overrides":{"prefix":"spark_sql_catalog"},"endpoints":["GET /v1/{prefix}/namespaces","GET /v1/{prefix}/namespaces/{namespace}","HEAD /v1/{prefix}/namespaces/{namespace}","POST /v1/{prefix}/namespaces","POST /v1/{prefix}/namespaces/{namespace}/properties","DELETE /v1/{prefix}/namespaces/{namespace}","GET /v1/{prefix}/namespaces/{namespace}/tables","GET /v1/{prefix}/namespaces/{namespace}/tables/{table}","HEAD /v1/{prefix}/ [...]
+Catalog created
+spark-sql (default)> use polaris;
+spark-sql ()> create namespace db1;
+spark-sql ()> create namespace db2;
+spark-sql ()> show namespaces;
+db1
+db2
+spark-sql ()>
+         > create namespace db1.schema1;
+spark-sql ()> show namespaces in db1;
+db1.schema1
+spark-sql ()>
+         > create table db1.schema1.iceberg_tb (col1 int);
+spark-sql ()> show tables in db1;
+spark-sql ()> show tables in db1.schema1;
+iceberg_tb
+spark-sql ()>
+         > use db1.schema1;
+spark-sql (db1.schema1)> insert into iceberg_tb values (123), (234), (111);
+spark-sql (db1.schema1)> select * from iceberg_tb order by col1;
+111
+123
+234
+spark-sql (db1.schema1)>
+         > create table delta_tb1(col1 string) using delta location 'file:///tmp/spark_catalog/delta_tb1';
+spark-sql (db1.schema1)> insert into delta_tb1 values ('ab'), ('bb'), ('dd');
+spark-sql (db1.schema1)> select * from delta_tb1 order by col1;
+ab
+bb
+dd
+spark-sql (db1.schema1)>
+         > show tables;
+iceberg_tb
+delta_tb1
+spark-sql (db1.schema1)>
+         > use db1;
+spark-sql (db1)> create table delta_tb2(col1 int) using delta location 'file:///tmp/spark_catalog/delta_tb2';
+spark-sql (db1)> insert into delta_tb2 values (1), (2), (3) order by col1;
+spark-sql (db1)> select * from delta_tb2;
+1
+2
+3
+spark-sql (db1)>
+         > show tables;
+delta_tb2
+spark-sql (db1)> show tables in db1.schema1;
+iceberg_tb
+delta_tb1
+spark-sql (db1)>
+         > drop table db1.schema1.iceberg_tb;
+spark-sql (db1)> drop table db1.schema1.delta_tb1;
+spark-sql (db1)> drop namespace db1.schema1;
+spark-sql (db1)> drop table delta_tb2;
+spark-sql (db1)> drop namespace db1;
+spark-sql (db1)> drop namespace db2;
+spark-sql (db1)>

diff --git a/plugins/spark/v3.5/regtests/spark_sql.sh b/plugins/spark/v3.5/regtests/spark_sql.sh
new file mode 100755
index 000000000..fe036664c
--- /dev/null
+++ b/plugins/spark/v3.5/regtests/spark_sql.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#

+SPARK_BEARER_TOKEN="${REGTEST_ROOT_BEARER_TOKEN}"

+CATALOG_NAME="spark_sql_catalog"
+curl -i -X POST -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \
+  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs \
+  -d '{"name": "spark_sql_catalog", "id": 100, "type": "INTERNAL", "readOnly": false, "properties": {"default-base-location": "file:///tmp/spark_catalog"}, "storageConfigInfo": {"storageType": "FILE", "allowedLocations": ["file:///tmp"]}}' > /dev/stderr

+# Add TABLE_WRITE_DATA to the catalog's catalog_admin role, since by default it can only manage access and metadata
+curl -i -X PUT -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \
+  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs/${CATALOG_NAME}/catalog-roles/catalog_admin/grants \
+  -d '{"type": "catalog", "privilege": "TABLE_WRITE_DATA"}' > /dev/stderr

+curl -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \
+  "http://${POLARIS_HOST:-localhost}:8181/api/catalog/v1/config?warehouse=${CATALOG_NAME}"
+echo
+echo "Catalog created"
+cat << EOF | ${SPARK_HOME}/bin/spark-sql -S --conf spark.sql.catalog.polaris.token="${SPARK_BEARER_TOKEN}" --conf spark.sql.catalog.polaris.warehouse=${CATALOG_NAME}
+use polaris;
+create namespace db1;
+create namespace db2;
+show namespaces;

+create namespace db1.schema1;
+show namespaces in db1;

+create table db1.schema1.iceberg_tb (col1 int);
+show tables in db1;
+show tables in db1.schema1;

+use db1.schema1;
+insert into iceberg_tb values (123), (234), (111);
+select * from iceberg_tb order by col1;

+create table delta_tb1(col1 string) using delta location 'file:///tmp/spark_catalog/delta_tb1';
+insert into delta_tb1 values ('ab'), ('bb'), ('dd');
+select * from delta_tb1 order by col1;

+show tables;

+use db1;
+create table delta_tb2(col1 int) using delta location 'file:///tmp/spark_catalog/delta_tb2';
+insert into delta_tb2 values (1), (2), (3) order by col1;
+select * from delta_tb2;

+show tables;
+show tables in db1.schema1;

+drop table db1.schema1.iceberg_tb;
+drop table db1.schema1.delta_tb1;
+drop namespace db1.schema1;
+drop table delta_tb2;
+drop namespace db1;
+drop namespace db2;
EOF

+# clean up the spark_catalog dir
+rm -rf /tmp/spark_catalog/

+curl -i -X DELETE -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \
+  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs/${CATALOG_NAME} > /dev/stderr

diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts
index ddf27ce1f..5ce7e73c0 100644
--- a/plugins/spark/v3.5/spark/build.gradle.kts
+++ b/plugins/spark/v3.5/spark/build.gradle.kts
@@ -19,10 +19,7 @@ import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar
 
-plugins {
-  id("polaris-client")
-  alias(libs.plugins.jandex)
-}
+plugins { id("polaris-client") }
 
 // get version information
 val sparkMajorVersion = "3.5"
@@ -38,13 +35,45 @@ val scalaLibraryVersion =
 }
 
 dependencies {
+  // TODO: extract a polaris-rest module as a thin layer for
+  // the client to depend on.
   implementation(project(":polaris-api-iceberg-service")) {
     // exclude the iceberg dependencies, use the ones pulled
     // by iceberg-core
     exclude("org.apache.iceberg", "*")
+    // exclude all cloud and quarkus specific dependencies to avoid
+    // running into problems with signature files.
+    exclude("com.azure", "*")
+    exclude("software.amazon.awssdk", "*")
+    exclude("com.google.cloud", "*")
+    exclude("io.airlift", "*")
+    exclude("io.smallrye", "*")
+    exclude("io.smallrye.common", "*")
+    exclude("io.swagger", "*")
+    exclude("org.apache.commons", "*")
+  }
+  implementation(project(":polaris-api-catalog-service")) {
+    exclude("org.apache.iceberg", "*")
+    exclude("com.azure", "*")
+    exclude("software.amazon.awssdk", "*")
+    exclude("com.google.cloud", "*")
+    exclude("io.airlift", "*")
+    exclude("io.smallrye", "*")
+    exclude("io.smallrye.common", "*")
+    exclude("io.swagger", "*")
+    exclude("org.apache.commons", "*")
+  }
+  implementation(project(":polaris-core")) {
+    exclude("org.apache.iceberg", "*")
+    exclude("com.azure", "*")
+    exclude("software.amazon.awssdk", "*")
+    exclude("com.google.cloud", "*")
+    exclude("io.airlift", "*")
+    exclude("io.smallrye", "*")
+    exclude("io.smallrye.common", "*")
+    exclude("io.swagger", "*")
+    exclude("org.apache.commons", "*")
   }
-  implementation(project(":polaris-api-catalog-service"))
-  implementation(project(":polaris-core")) { exclude("org.apache.iceberg", "*") }
 
   implementation("org.apache.iceberg:iceberg-core:${icebergVersion}")
 
@@ -127,8 +156,6 @@ tasks.register<ShadowJar>("createPolarisSparkJar") {
     "polaris-iceberg-${icebergVersion}-spark-runtime-${sparkMajorVersion}_${scalaVersion}"
 
   isZip64 = true
 
-  mergeServiceFiles()
-
   // pack both the source code and dependencies
   from(sourceSets.main.get().output)